af5f9ea25cd84b6c327d58a09e9ee787fc974290ff2d7d5dfe22b54aad11d08d
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp +75 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu +145 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/vision.cpp +117 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/deform_conv.py +514 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/losses.py +133 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/mask_ops.py +275 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/nms.py +144 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align.py +74 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align_rotated.py +100 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/rotated_boxes.py +21 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/shape_spec.py +18 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/wrappers.py +162 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/__init__.py +10 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/model_zoo.py +213 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/__init__.py +64 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/anchor_generator.py +386 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/__init__.py +20 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/backbone.py +74 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/build.py +33 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/fpn.py +268 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/mvit.py +448 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/regnet.py +452 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/resnet.py +694 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/swin.py +695 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/utils.py +186 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/vit.py +524 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/box_regression.py +369 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/matcher.py +127 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/__init__.py +16 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/build.py +24 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/dense_detector.py +294 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/fcos.py +328 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/panoptic_fpn.py +269 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/rcnn.py +341 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/retinanet.py +439 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/semantic_seg.py +267 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/mmdet_wrapper.py +273 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/poolers.py +263 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/postprocessing.py +100 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/__init__.py +5 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/build.py +24 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/proposal_utils.py +205 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rpn.py +533 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rrpn.py +209 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/__init__.py +29 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/box_head.py +118 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/cascade_rcnn.py +299 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/fast_rcnn.py +569 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/keypoint_head.py +272 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/mask_head.py +298 -0
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
ADDED
@@ -0,0 +1,75 @@
// Copyright (c) Facebook, Inc. and its affiliates.
#include "../box_iou_rotated/box_iou_rotated_utils.h"
#include "nms_rotated.h"

namespace detectron2 {

template <typename scalar_t>
at::Tensor nms_rotated_cpu_kernel(
    const at::Tensor& dets,
    const at::Tensor& scores,
    const double iou_threshold) {
  // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
  // however, the code in this function is much shorter because
  // we delegate the IoU computation for rotated boxes to
  // the single_box_iou_rotated function in box_iou_rotated_utils.h
  AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor");
  AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor");
  AT_ASSERTM(
      dets.scalar_type() == scores.scalar_type(),
      "dets should have the same type as scores");

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong));
  }

  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto ndets = dets.size(0);
  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));

  auto suppressed = suppressed_t.data_ptr<uint8_t>();
  auto keep = keep_t.data_ptr<int64_t>();
  auto order = order_t.data_ptr<int64_t>();

  int64_t num_to_keep = 0;

  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1) {
      continue;
    }

    keep[num_to_keep++] = i;

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1) {
        continue;
      }

      auto ovr = single_box_iou_rotated<scalar_t>(
          dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>());
      if (ovr >= iou_threshold) {
        suppressed[j] = 1;
      }
    }
  }
  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}

at::Tensor nms_rotated_cpu(
    // input must be contiguous
    const at::Tensor& dets,
    const at::Tensor& scores,
    const double iou_threshold) {
  auto result = at::empty({0}, dets.options());

  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
    result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
  });
  return result;
}

} // namespace detectron2
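The CPU kernel above is a standard greedy suppression loop: visit boxes in descending score order, keep a box if it has not been suppressed, and suppress every later box whose rotated IoU with it reaches the threshold. The sketch below mirrors that control flow in plain PyTorch for illustration only; the helper names (greedy_nms, axis_aligned_iou) are mine, and an axis-aligned IoU stands in for single_box_iou_rotated, which lives in the C++ sources.

import torch


def axis_aligned_iou(a: torch.Tensor, b: torch.Tensor) -> float:
    """Stand-in for single_box_iou_rotated; boxes here are (x1, y1, x2, y2)."""
    lt = torch.max(a[:2], b[:2])
    rb = torch.min(a[2:], b[2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[0] * wh[1]
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return float(inter / (area_a + area_b - inter + 1e-7))


def greedy_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float) -> torch.Tensor:
    """Same greedy loop as nms_rotated_cpu_kernel, with a swapped-in IoU."""
    order = scores.sort(0, descending=True).indices
    suppressed = torch.zeros(len(dets), dtype=torch.bool)
    keep = []
    for _i in range(len(order)):
        i = order[_i]
        if suppressed[i]:
            continue
        keep.append(i.item())
        for _j in range(_i + 1, len(order)):
            j = order[_j]
            if suppressed[j]:
                continue
            if axis_aligned_iou(dets[i], dets[j]) >= iou_threshold:
                suppressed[j] = True
    return torch.tensor(keep, dtype=torch.int64)


if __name__ == "__main__":
    boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0], [20.0, 20.0, 30.0, 30.0]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    print(greedy_nms(boxes, scores, iou_threshold=0.5))  # tensor([0, 2])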
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
ADDED
@@ -0,0 +1,145 @@
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#ifdef WITH_CUDA
#include "../box_iou_rotated/box_iou_rotated_utils.h"
#endif
// TODO avoid this when pytorch supports "same directory" hipification
#ifdef WITH_HIP
#include "box_iou_rotated/box_iou_rotated_utils.h"
#endif

using namespace detectron2;

namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}

template <typename T>
__global__ void nms_rotated_cuda_kernel(
    const int n_boxes,
    const double iou_threshold,
    const T* dev_boxes,
    unsigned long long* dev_mask) {
  // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel

  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Compared to nms_cuda_kernel, where each box is represented with 4 values
  // (x1, y1, x2, y2), each rotated box is represented with 5 values
  // (x_center, y_center, width, height, angle_degrees) here.
  __shared__ T block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const T* cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      // Instead of devIoU used by original horizontal nms, here
      // we use the single_box_iou_rotated function from box_iou_rotated_utils.h
      if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >
          iou_threshold) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

namespace detectron2 {

at::Tensor nms_rotated_cuda(
    // input must be contiguous
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  // using scalar_t = float;
  AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
  AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
  at::cuda::CUDAGuard device_guard(dets.device());

  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto dets_sorted = dets.index_select(0, order_t);

  auto dets_num = dets.size(0);

  const int col_blocks =
      at::cuda::ATenCeilDiv(static_cast<int>(dets_num), threadsPerBlock);

  at::Tensor mask =
      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES(
      dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
        nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            dets_num,
            iou_threshold,
            dets_sorted.data_ptr<scalar_t>(),
            (unsigned long long*)mask.data_ptr<int64_t>());
      });

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host =
      (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = mask_host + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  AT_CUDA_CHECK(cudaGetLastError());
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}

} // namespace detectron2
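The CUDA variant splits the score-sorted boxes into 64-box blocks: each thread writes one 64-bit word per column block whose set bits mark later boxes it overlaps above the threshold, and the final keep/suppress decision is the serial scan over those words at the end of nms_rotated_cuda. The Python snippet below reproduces only that host-side scan over a toy boolean overlap matrix, just to make the bitmask bookkeeping concrete; host_side_scan is an illustrative helper of mine, and the real IoU work stays in the kernel.

import torch

THREADS_PER_BLOCK = 64  # sizeof(unsigned long long) * 8 in the kernel


def host_side_scan(overlaps: torch.Tensor) -> list:
    """overlaps[i, j] is True when IoU(box_i, box_j) > threshold and j > i,
    with boxes already sorted by descending score. Mirrors the mask/remv
    loop at the end of nms_rotated_cuda."""
    n = overlaps.shape[0]
    col_blocks = (n + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK

    # Pack each row of the overlap matrix into col_blocks 64-bit words,
    # the same layout the kernel writes into dev_mask.
    mask = [[0] * col_blocks for _ in range(n)]
    for i in range(n):
        for j in range(i + 1, n):
            if overlaps[i, j]:
                mask[i][j // THREADS_PER_BLOCK] |= 1 << (j % THREADS_PER_BLOCK)

    remv = [0] * col_blocks
    keep = []
    for i in range(n):
        nblock, inblock = divmod(i, THREADS_PER_BLOCK)
        if not (remv[nblock] >> inblock) & 1:
            keep.append(i)
            for j in range(nblock, col_blocks):
                remv[j] |= mask[i][j]
    return keep


if __name__ == "__main__":
    # Box 1 overlaps box 0 above threshold; box 2 overlaps nothing.
    overlaps = torch.tensor([[False, True, False],
                             [False, False, False],
                             [False, False, False]])
    print(host_side_scan(overlaps))  # [0, 2]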
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/vision.cpp
ADDED
@@ -0,0 +1,117 @@
// Copyright (c) Facebook, Inc. and its affiliates.

#include <torch/extension.h>
#include "ROIAlignRotated/ROIAlignRotated.h"
#include "box_iou_rotated/box_iou_rotated.h"
#include "cocoeval/cocoeval.h"
#include "deformable/deform_conv.h"
#include "nms_rotated/nms_rotated.h"

namespace detectron2 {

#if defined(WITH_CUDA) || defined(WITH_HIP)
extern int get_cudart_version();
#endif

std::string get_cuda_version() {
#if defined(WITH_CUDA) || defined(WITH_HIP)
  std::ostringstream oss;

#if defined(WITH_CUDA)
  oss << "CUDA ";
#else
  oss << "HIP ";
#endif

  // copied from
  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
  auto printCudaStyleVersion = [&](int v) {
    oss << (v / 1000) << "." << (v / 10 % 100);
    if (v % 10 != 0) {
      oss << "." << (v % 10);
    }
  };
  printCudaStyleVersion(get_cudart_version());
  return oss.str();
#else // neither CUDA nor HIP
  return std::string("not available");
#endif
}

bool has_cuda() {
#if defined(WITH_CUDA)
  return true;
#else
  return false;
#endif
}

// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
std::string get_compiler_version() {
  std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__

#if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8))
#error "GCC >= 4.9 is required!"
#endif

  { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif

#if defined(__clang_major__)
  {
    ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
       << __clang_patchlevel__;
  }
#endif

#if defined(_MSC_VER)
  { ss << "MSVC " << _MSC_FULL_VER; }
#endif
  return ss.str();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
  m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");
  m.def("has_cuda", &has_cuda, "has_cuda");

  m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
  m.def(
      "deform_conv_backward_input",
      &deform_conv_backward_input,
      "deform_conv_backward_input");
  m.def(
      "deform_conv_backward_filter",
      &deform_conv_backward_filter,
      "deform_conv_backward_filter");
  m.def(
      "modulated_deform_conv_forward",
      &modulated_deform_conv_forward,
      "modulated_deform_conv_forward");
  m.def(
      "modulated_deform_conv_backward",
      &modulated_deform_conv_backward,
      "modulated_deform_conv_backward");

  m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
  m.def(
      "COCOevalEvaluateImages",
      &COCOeval::EvaluateImages,
      "COCOeval::EvaluateImages");
  pybind11::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
      .def(pybind11::init<uint64_t, double, double, bool, bool>());
  pybind11::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation")
      .def(pybind11::init<>());
}

TORCH_LIBRARY(detectron2, m) {
  m.def("nms_rotated", &nms_rotated);
  m.def("box_iou_rotated", &box_iou_rotated);
  m.def("roi_align_rotated_forward", &ROIAlignRotated_forward);
  m.def("roi_align_rotated_backward", &ROIAlignRotated_backward);
}
} // namespace detectron2
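vision.cpp is the binding layer: the PYBIND11_MODULE block exposes the build-info helpers and the deformable-conv / COCOeval entry points on the compiled _C module, while the TORCH_LIBRARY block registers the rotated NMS, rotated box IoU, and rotated ROIAlign ops under the detectron2 namespace. A minimal probe of those bindings might look like the sketch below; it assumes the extension has actually been compiled and is importable from the vendored package path used elsewhere in this diff, and it degrades gracefully otherwise.

# Sketch only: the import path is the vendored one assumed by deform_conv.py below.
try:
    from annotator.oneformer.detectron2 import _C  # compiled extension module

    print("compiler:", _C.get_compiler_version())
    print("cuda:", _C.get_cuda_version())   # "not available" in CPU-only builds
    print("has_cuda:", _C.has_cuda())
except ImportError:
    # deform_conv.py handles this same failure by substituting dummy classes.
    print("detectron2._C is not built; the Python layers fall back to dummies.")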
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/deform_conv.py
ADDED
@@ -0,0 +1,514 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import math
from functools import lru_cache
import torch
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from torchvision.ops import deform_conv2d

from annotator.oneformer.detectron2.utils.develop import create_dummy_class, create_dummy_func

from .wrappers import _NewEmptyTensorOp


class _DeformConv(Function):
    @staticmethod
    def forward(
        ctx,
        input,
        offset,
        weight,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        deformable_groups=1,
        im2col_step=64,
    ):
        if input is not None and input.dim() != 4:
            raise ValueError(
                "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim())
            )
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.deformable_groups = deformable_groups
        ctx.im2col_step = im2col_step

        ctx.save_for_backward(input, offset, weight)

        output = input.new_empty(
            _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)
        )

        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones

        if not input.is_cuda:
            # TODO: let torchvision support full features of our deformconv.
            if deformable_groups != 1:
                raise NotImplementedError(
                    "Deformable Conv with deformable_groups != 1 is not supported on CPUs!"
                )
            return deform_conv2d(
                input, offset, weight, stride=stride, padding=padding, dilation=dilation
            )
        else:
            cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
            assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"

            _C.deform_conv_forward(
                input,
                weight,
                offset,
                output,
                ctx.bufs_[0],
                ctx.bufs_[1],
                weight.size(3),
                weight.size(2),
                ctx.stride[1],
                ctx.stride[0],
                ctx.padding[1],
                ctx.padding[0],
                ctx.dilation[1],
                ctx.dilation[0],
                ctx.groups,
                ctx.deformable_groups,
                cur_im2col_step,
            )
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input, offset, weight = ctx.saved_tensors

        grad_input = grad_offset = grad_weight = None

        if not grad_output.is_cuda:
            raise NotImplementedError("Deformable Conv is not supported on CPUs!")
        else:
            cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
            assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"

            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
                grad_input = torch.zeros_like(input)
                grad_offset = torch.zeros_like(offset)
                _C.deform_conv_backward_input(
                    input,
                    offset,
                    grad_output,
                    grad_input,
                    grad_offset,
                    weight,
                    ctx.bufs_[0],
                    weight.size(3),
                    weight.size(2),
                    ctx.stride[1],
                    ctx.stride[0],
                    ctx.padding[1],
                    ctx.padding[0],
                    ctx.dilation[1],
                    ctx.dilation[0],
                    ctx.groups,
                    ctx.deformable_groups,
                    cur_im2col_step,
                )

            if ctx.needs_input_grad[2]:
                grad_weight = torch.zeros_like(weight)
                _C.deform_conv_backward_filter(
                    input,
                    offset,
                    grad_output,
                    grad_weight,
                    ctx.bufs_[0],
                    ctx.bufs_[1],
                    weight.size(3),
                    weight.size(2),
                    ctx.stride[1],
                    ctx.stride[0],
                    ctx.padding[1],
                    ctx.padding[0],
                    ctx.dilation[1],
                    ctx.dilation[0],
                    ctx.groups,
                    ctx.deformable_groups,
                    1,
                    cur_im2col_step,
                )

        return grad_input, grad_offset, grad_weight, None, None, None, None, None, None

    @staticmethod
    def _output_size(input, weight, padding, dilation, stride):
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = padding[d]
            kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
        if not all(map(lambda s: s > 0, output_size)):
            raise ValueError(
                "convolution input is too small (output would be {})".format(
                    "x".join(map(str, output_size))
                )
            )
        return output_size

    @staticmethod
    @lru_cache(maxsize=128)
    def _cal_im2col_step(input_size, default_size):
        """
        Calculate proper im2col step size, which should be divisible by input_size and not larger
        than prefer_size. Meanwhile the step size should be as large as possible to be more
        efficient. So we choose the largest one among all divisors of input_size which are smaller
        than prefer_size.
        :param input_size: input batch size .
        :param default_size: default preferred im2col step size.
        :return: the largest proper step size.
        """
        if input_size <= default_size:
            return input_size
        best_step = 1
        for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)):
            if input_size % step == 0:
                if input_size // step <= default_size:
                    return input_size // step
                best_step = step

        return best_step


class _ModulatedDeformConv(Function):
    @staticmethod
    def forward(
        ctx,
        input,
        offset,
        mask,
        weight,
        bias=None,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        deformable_groups=1,
    ):
        ctx.stride = stride
        ctx.padding = padding
        ctx.dilation = dilation
        ctx.groups = groups
        ctx.deformable_groups = deformable_groups
        ctx.with_bias = bias is not None
        if not ctx.with_bias:
            bias = input.new_empty(1)  # fake tensor
        if not input.is_cuda:
            raise NotImplementedError("Deformable Conv is not supported on CPUs!")
        if (
            weight.requires_grad
            or mask.requires_grad
            or offset.requires_grad
            or input.requires_grad
        ):
            ctx.save_for_backward(input, offset, mask, weight, bias)
        output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight))
        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
        _C.modulated_deform_conv_forward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            output,
            ctx._bufs[1],
            weight.shape[2],
            weight.shape[3],
            ctx.stride,
            ctx.stride,
            ctx.padding,
            ctx.padding,
            ctx.dilation,
            ctx.dilation,
            ctx.groups,
            ctx.deformable_groups,
            ctx.with_bias,
        )
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        if not grad_output.is_cuda:
            raise NotImplementedError("Deformable Conv is not supported on CPUs!")
        input, offset, mask, weight, bias = ctx.saved_tensors
        grad_input = torch.zeros_like(input)
        grad_offset = torch.zeros_like(offset)
        grad_mask = torch.zeros_like(mask)
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros_like(bias)
        _C.modulated_deform_conv_backward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            ctx._bufs[1],
            grad_input,
            grad_weight,
            grad_bias,
            grad_offset,
            grad_mask,
            grad_output,
            weight.shape[2],
            weight.shape[3],
            ctx.stride,
            ctx.stride,
            ctx.padding,
            ctx.padding,
            ctx.dilation,
            ctx.dilation,
            ctx.groups,
            ctx.deformable_groups,
            ctx.with_bias,
        )
        if not ctx.with_bias:
            grad_bias = None

        return (
            grad_input,
            grad_offset,
            grad_mask,
            grad_weight,
            grad_bias,
            None,
            None,
            None,
            None,
            None,
        )

    @staticmethod
    def _infer_shape(ctx, input, weight):
        n = input.size(0)
        channels_out = weight.size(0)
        height, width = input.shape[2:4]
        kernel_h, kernel_w = weight.shape[2:4]
        height_out = (
            height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)
        ) // ctx.stride + 1
        width_out = (
            width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)
        ) // ctx.stride + 1
        return n, channels_out, height_out, width_out


deform_conv = _DeformConv.apply
modulated_deform_conv = _ModulatedDeformConv.apply


class DeformConv(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        deformable_groups=1,
        bias=False,
        norm=None,
        activation=None,
    ):
        """
        Deformable convolution from :paper:`deformconv`.

        Arguments are similar to :class:`Conv2D`. Extra arguments:

        Args:
            deformable_groups (int): number of groups used in deformable convolution.
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation function
        """
        super(DeformConv, self).__init__()

        assert not bias
        assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format(
            in_channels, groups
        )
        assert (
            out_channels % groups == 0
        ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        self.padding = _pair(padding)
        self.dilation = _pair(dilation)
        self.groups = groups
        self.deformable_groups = deformable_groups
        self.norm = norm
        self.activation = activation

        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)
        )
        self.bias = None

        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")

    def forward(self, x, offset):
        if x.numel() == 0:
            # When input is empty, we want to return a empty tensor with "correct" shape,
            # So that the following operations will not panic
            # if they check for the shape of the tensor.
            # This computes the height and width of the output tensor
            output_shape = [
                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
                for i, p, di, k, s in zip(
                    x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
                )
            ]
            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
            return _NewEmptyTensorOp.apply(x, output_shape)

        x = deform_conv(
            x,
            offset,
            self.weight,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
            self.deformable_groups,
        )
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x

    def extra_repr(self):
        tmpstr = "in_channels=" + str(self.in_channels)
        tmpstr += ", out_channels=" + str(self.out_channels)
        tmpstr += ", kernel_size=" + str(self.kernel_size)
        tmpstr += ", stride=" + str(self.stride)
        tmpstr += ", padding=" + str(self.padding)
        tmpstr += ", dilation=" + str(self.dilation)
        tmpstr += ", groups=" + str(self.groups)
        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
        tmpstr += ", bias=False"
        return tmpstr


class ModulatedDeformConv(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        deformable_groups=1,
        bias=True,
        norm=None,
        activation=None,
    ):
        """
        Modulated deformable convolution from :paper:`deformconv2`.

        Arguments are similar to :class:`Conv2D`. Extra arguments:

        Args:
            deformable_groups (int): number of groups used in deformable convolution.
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation function
        """
        super(ModulatedDeformConv, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.deformable_groups = deformable_groups
        self.with_bias = bias
        self.norm = norm
        self.activation = activation

        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
        )
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        else:
            self.bias = None

        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
        if self.bias is not None:
            nn.init.constant_(self.bias, 0)

    def forward(self, x, offset, mask):
        if x.numel() == 0:
            output_shape = [
                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
                for i, p, di, k, s in zip(
                    x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
                )
            ]
            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
            return _NewEmptyTensorOp.apply(x, output_shape)

        x = modulated_deform_conv(
            x,
            offset,
            mask,
            self.weight,
            self.bias,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
            self.deformable_groups,
        )
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x

    def extra_repr(self):
        tmpstr = "in_channels=" + str(self.in_channels)
        tmpstr += ", out_channels=" + str(self.out_channels)
        tmpstr += ", kernel_size=" + str(self.kernel_size)
        tmpstr += ", stride=" + str(self.stride)
        tmpstr += ", padding=" + str(self.padding)
        tmpstr += ", dilation=" + str(self.dilation)
        tmpstr += ", groups=" + str(self.groups)
        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
        tmpstr += ", bias=" + str(self.with_bias)
        return tmpstr


try:
    from annotator.oneformer.detectron2 import _C
except ImportError:
    # TODO: register ops natively so there is no need to import _C.
    _msg = "detectron2 is not compiled successfully, please build following the instructions!"
    _args = ("detectron2._C", _msg)
    DeformConv = create_dummy_class("DeformConv", *_args)
    ModulatedDeformConv = create_dummy_class("ModulatedDeformConv", *_args)
    deform_conv = create_dummy_func("deform_conv", *_args)
    modulated_deform_conv = create_dummy_func("modulated_deform_conv", *_args)
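On CPU, _DeformConv.forward falls back to torchvision's deform_conv2d (with deformable_groups restricted to 1), so the offset-tensor convention can be demonstrated without the compiled _C module. The sketch below calls torchvision directly with shapes chosen for illustration; the offset argument has the layout DeformConv.forward expects, namely 2 * deformable_groups * kH * kW channels at the output resolution, one (dy, dx) pair per kernel tap.

import torch
from torchvision.ops import deform_conv2d

# Toy shapes: batch 2, 8 input channels, 16x16 feature map, 3x3 kernel, padding 1.
n, c_in, c_out, k, h, w = 2, 8, 4, 3, 16, 16
x = torch.randn(n, c_in, h, w)
weight = torch.randn(c_out, c_in, k, k)

# offset channels = 2 * deformable_groups * kH * kW (deformable_groups = 1 here);
# all-zero offsets reduce deformable convolution to an ordinary convolution.
offset = torch.zeros(n, 2 * k * k, h, w)

out = deform_conv2d(x, offset, weight, stride=1, padding=1, dilation=1)
print(out.shape)  # torch.Size([2, 4, 16, 16])

# Sanity check: zero offsets match a plain conv2d with the same weight.
ref = torch.nn.functional.conv2d(x, weight, stride=1, padding=1)
print(torch.allclose(out, ref, atol=1e-5))  # True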
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/losses.py
ADDED
@@ -0,0 +1,133 @@
import math
import torch


def diou_loss(
    boxes1: torch.Tensor,
    boxes2: torch.Tensor,
    reduction: str = "none",
    eps: float = 1e-7,
) -> torch.Tensor:
    """
    Distance Intersection over Union Loss (Zhaohui Zheng et. al)
    https://arxiv.org/abs/1911.08287
    Args:
        boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
        reduction: 'none' | 'mean' | 'sum'
            'none': No reduction will be applied to the output.
            'mean': The output will be averaged.
            'sum': The output will be summed.
        eps (float): small number to prevent division by zero
    """

    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)

    # TODO: use torch._assert_async() when pytorch 1.8 support is dropped
    assert (x2 >= x1).all(), "bad box: x1 larger than x2"
    assert (y2 >= y1).all(), "bad box: y1 larger than y2"

    # Intersection keypoints
    xkis1 = torch.max(x1, x1g)
    ykis1 = torch.max(y1, y1g)
    xkis2 = torch.min(x2, x2g)
    ykis2 = torch.min(y2, y2g)

    intsct = torch.zeros_like(x1)
    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
    intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
    union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
    iou = intsct / union

    # smallest enclosing box
    xc1 = torch.min(x1, x1g)
    yc1 = torch.min(y1, y1g)
    xc2 = torch.max(x2, x2g)
    yc2 = torch.max(y2, y2g)
    diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps

    # centers of boxes
    x_p = (x2 + x1) / 2
    y_p = (y2 + y1) / 2
    x_g = (x1g + x2g) / 2
    y_g = (y1g + y2g) / 2
    distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)

    # Eqn. (7)
    loss = 1 - iou + (distance / diag_len)
    if reduction == "mean":
        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
    elif reduction == "sum":
        loss = loss.sum()

    return loss


def ciou_loss(
    boxes1: torch.Tensor,
    boxes2: torch.Tensor,
    reduction: str = "none",
    eps: float = 1e-7,
) -> torch.Tensor:
    """
    Complete Intersection over Union Loss (Zhaohui Zheng et. al)
    https://arxiv.org/abs/1911.08287
    Args:
        boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
        reduction: 'none' | 'mean' | 'sum'
            'none': No reduction will be applied to the output.
            'mean': The output will be averaged.
            'sum': The output will be summed.
        eps (float): small number to prevent division by zero
    """

    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)

    # TODO: use torch._assert_async() when pytorch 1.8 support is dropped
    assert (x2 >= x1).all(), "bad box: x1 larger than x2"
    assert (y2 >= y1).all(), "bad box: y1 larger than y2"

    # Intersection keypoints
    xkis1 = torch.max(x1, x1g)
    ykis1 = torch.max(y1, y1g)
    xkis2 = torch.min(x2, x2g)
    ykis2 = torch.min(y2, y2g)

    intsct = torch.zeros_like(x1)
    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
    intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
    union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
    iou = intsct / union

    # smallest enclosing box
    xc1 = torch.min(x1, x1g)
    yc1 = torch.min(y1, y1g)
    xc2 = torch.max(x2, x2g)
    yc2 = torch.max(y2, y2g)
    diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps

    # centers of boxes
    x_p = (x2 + x1) / 2
    y_p = (y2 + y1) / 2
    x_g = (x1g + x2g) / 2
    y_g = (y1g + y2g) / 2
    distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)

    # width and height of boxes
    w_pred = x2 - x1
    h_pred = y2 - y1
    w_gt = x2g - x1g
    h_gt = y2g - y1g
    v = (4 / (math.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2)
    with torch.no_grad():
        alpha = v / (1 - iou + v + eps)

    # Eqn. (10)
    loss = 1 - iou + (distance / diag_len) + alpha * v
    if reduction == "mean":
        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
    elif reduction == "sum":
        loss = loss.sum()

    return loss
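Both losses take XYXY boxes and differ only in CIoU's extra aspect-ratio penalty term. A small usage sketch follows, assuming the vendored module path in this diff is importable as a Python package (the functions themselves are pure PyTorch):

import torch

# Assumed import path for this vendored copy; upstream detectron2 ships the
# same functions in detectron2/layers/losses.py.
from annotator.oneformer.detectron2.layers.losses import ciou_loss, diou_loss

pred = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                     [5.0, 5.0, 15.0, 20.0]])
target = torch.tensor([[1.0, 1.0, 11.0, 11.0],
                       [5.0, 5.0, 15.0, 20.0]])

print(diou_loss(pred, target, reduction="none"))  # per-box loss, near 0 for the identical pair
print(ciou_loss(pred, target, reduction="mean"))  # scalar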
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/mask_ops.py
ADDED
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
import numpy as np
|
3 |
+
from typing import Tuple
|
4 |
+
import torch
|
5 |
+
from PIL import Image
|
6 |
+
from torch.nn import functional as F
|
7 |
+
|
8 |
+
__all__ = ["paste_masks_in_image"]
|
9 |
+
|
10 |
+
|
11 |
+
BYTES_PER_FLOAT = 4
|
12 |
+
# TODO: This memory limit may be too much or too little. It would be better to
|
13 |
+
# determine it based on available resources.
|
14 |
+
GPU_MEM_LIMIT = 1024**3 # 1 GB memory limit
|
15 |
+
|
16 |
+
|
17 |
+
def _do_paste_mask(masks, boxes, img_h: int, img_w: int, skip_empty: bool = True):
|
18 |
+
"""
|
19 |
+
Args:
|
20 |
+
masks: N, 1, H, W
|
21 |
+
boxes: N, 4
|
22 |
+
img_h, img_w (int):
|
23 |
+
skip_empty (bool): only paste masks within the region that
|
24 |
+
tightly bound all boxes, and returns the results this region only.
|
25 |
+
An important optimization for CPU.
|
26 |
+
|
27 |
+
Returns:
|
28 |
+
if skip_empty == False, a mask of shape (N, img_h, img_w)
|
29 |
+
if skip_empty == True, a mask of shape (N, h', w'), and the slice
|
30 |
+
object for the corresponding region.
|
31 |
+
"""
|
32 |
+
# On GPU, paste all masks together (up to chunk size)
|
33 |
+
# by using the entire image to sample the masks
|
34 |
+
# Compared to pasting them one by one,
|
35 |
+
# this has more operations but is faster on COCO-scale dataset.
|
36 |
+
device = masks.device
|
37 |
+
|
38 |
+
if skip_empty and not torch.jit.is_scripting():
|
39 |
+
x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to(
|
40 |
+
dtype=torch.int32
|
41 |
+
)
|
42 |
+
x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
|
43 |
+
y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
|
44 |
+
else:
|
45 |
+
x0_int, y0_int = 0, 0
|
46 |
+
x1_int, y1_int = img_w, img_h
|
47 |
+
x0, y0, x1, y1 = torch.split(boxes, 1, dim=1) # each is Nx1
|
48 |
+
|
49 |
+
N = masks.shape[0]
|
50 |
+
|
51 |
+
img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
|
52 |
+
img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
|
53 |
+
img_y = (img_y - y0) / (y1 - y0) * 2 - 1
|
54 |
+
img_x = (img_x - x0) / (x1 - x0) * 2 - 1
|
55 |
+
# img_x, img_y have shapes (N, w), (N, h)
|
56 |
+
|
57 |
+
gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
|
58 |
+
gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
|
59 |
+
grid = torch.stack([gx, gy], dim=3)
|
60 |
+
|
61 |
+
if not torch.jit.is_scripting():
|
62 |
+
if not masks.dtype.is_floating_point:
|
63 |
+
masks = masks.float()
|
64 |
+
img_masks = F.grid_sample(masks, grid.to(masks.dtype), align_corners=False)
|
65 |
+
|
66 |
+
if skip_empty and not torch.jit.is_scripting():
|
67 |
+
return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
|
68 |
+
else:
|
69 |
+
return img_masks[:, 0], ()
|
70 |
+
|
71 |
+
|
72 |
+
# Annotate boxes as Tensor (but not Boxes) in order to use scripting
|
73 |
+
@torch.jit.script_if_tracing
|
74 |
+
def paste_masks_in_image(
|
75 |
+
masks: torch.Tensor, boxes: torch.Tensor, image_shape: Tuple[int, int], threshold: float = 0.5
|
76 |
+
):
|
77 |
+
"""
|
78 |
+
Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image.
|
79 |
+
The location, height, and width for pasting each mask is determined by their
|
80 |
+
corresponding bounding boxes in boxes.
|
81 |
+
|
82 |
+
Note:
|
83 |
+
This is a complicated but more accurate implementation. In actual deployment, it is
|
84 |
+
often enough to use a faster but less accurate implementation.
|
85 |
+
See :func:`paste_mask_in_image_old` in this file for an alternative implementation.
|
86 |
+
|
87 |
+
Args:
|
88 |
+
masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
|
89 |
+
detected object instances in the image and Hmask, Wmask are the mask width and mask
|
90 |
+
height of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1].
|
91 |
+
boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4).
|
92 |
+
boxes[i] and masks[i] correspond to the same object instance.
|
93 |
+
image_shape (tuple): height, width
|
94 |
+
threshold (float): A threshold in [0, 1] for converting the (soft) masks to
|
95 |
+
binary masks.
|
96 |
+
|
97 |
+
Returns:
|
98 |
+
img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
|
99 |
+
number of detected object instances and Himage, Wimage are the image width
|
100 |
+
and height. img_masks[i] is a binary mask for object instance i.
|
101 |
+
"""
|
102 |
+
|
103 |
+
assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
|
104 |
+
N = len(masks)
|
105 |
+
if N == 0:
|
106 |
+
return masks.new_empty((0,) + image_shape, dtype=torch.uint8)
|
107 |
+
if not isinstance(boxes, torch.Tensor):
|
108 |
+
boxes = boxes.tensor
|
109 |
+
device = boxes.device
|
110 |
+
assert len(boxes) == N, boxes.shape
|
111 |
+
|
112 |
+
img_h, img_w = image_shape
|
113 |
+
|
114 |
+
# The actual implementation split the input into chunks,
|
115 |
+
# and paste them chunk by chunk.
|
116 |
+
if device.type == "cpu" or torch.jit.is_scripting():
|
117 |
+
# CPU is most efficient when they are pasted one by one with skip_empty=True
|
118 |
+
# so that it performs minimal number of operations.
|
119 |
+
num_chunks = N
|
120 |
+
else:
|
121 |
+
# GPU benefits from parallelism for larger chunks, but may have memory issue
|
122 |
+
# int(img_h) because shape may be tensors in tracing
|
123 |
+
num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
|
124 |
+
assert (
|
125 |
+
num_chunks <= N
|
126 |
+
), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it"
|
127 |
+
chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
|
128 |
+
|
129 |
+
img_masks = torch.zeros(
|
130 |
+
N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8
|
131 |
+
)
|
132 |
+
for inds in chunks:
|
133 |
+
masks_chunk, spatial_inds = _do_paste_mask(
|
134 |
+
masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu"
|
135 |
+
)
|
136 |
+
|
137 |
+
if threshold >= 0:
|
138 |
+
masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
|
139 |
+
else:
|
140 |
+
# for visualization and debugging
|
141 |
+
masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
|
142 |
+
|
143 |
+
if torch.jit.is_scripting(): # Scripting does not use the optimized codepath
|
144 |
+
img_masks[inds] = masks_chunk
|
145 |
+
else:
|
146 |
+
img_masks[(inds,) + spatial_inds] = masks_chunk
|
147 |
+
return img_masks
|
148 |
+
|
149 |
+
|
150 |
+
# The below are the original paste function (from Detectron1) which has
|
151 |
+
# larger quantization error.
|
152 |
+
# It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample.
|
153 |
+
|
154 |
+
|
155 |
+
def paste_mask_in_image_old(mask, box, img_h, img_w, threshold):
|
156 |
+
"""
|
157 |
+
Paste a single mask in an image.
|
158 |
+
This is a per-box implementation of :func:`paste_masks_in_image`.
|
159 |
+
This function has larger quantization error due to incorrect pixel
|
160 |
+
modeling and is not used any more.
|
161 |
+
|
162 |
+
Args:
|
163 |
+
mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single
|
164 |
+
object instance. Values are in [0, 1].
|
165 |
+
box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners
|
166 |
+
            of the object instance.
        img_h, img_w (int): Image height and width.
        threshold (float): Mask binarization threshold in [0, 1].

    Returns:
        im_mask (Tensor):
            The resized and binarized object mask pasted into the original
            image plane (a tensor of shape (img_h, img_w)).
    """
    # Conversion from continuous box coordinates to discrete pixel coordinates
    # via truncation (cast to int32). This determines which pixels to paste the
    # mask onto.
    box = box.to(dtype=torch.int32)  # Continuous to discrete coordinate conversion
    # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to
    # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1
    # pixels (not x1 - x0 pixels).
    samples_w = box[2] - box[0] + 1  # Number of pixel samples, *not* geometric width
    samples_h = box[3] - box[1] + 1  # Number of pixel samples, *not* geometric height

    # Resample the mask from its original grid to the new samples_w x samples_h grid
    mask = Image.fromarray(mask.cpu().numpy())
    mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR)
    mask = np.array(mask, copy=False)

    if threshold >= 0:
        mask = np.array(mask > threshold, dtype=np.uint8)
        mask = torch.from_numpy(mask)
    else:
        # for visualization and debugging, we also
        # allow it to return an unmodified mask
        mask = torch.from_numpy(mask * 255).to(torch.uint8)

    im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, img_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, img_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[
        (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
    ]
    return im_mask


# Our pixel modeling requires extrapolation for any continuous
# coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks,
# we would like this extrapolation to be an interpolation between boundary values and zero,
# instead of using absolute zero or boundary values.
# Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this:
# masks, scale = pad_masks(masks[:, 0, :, :], 1)
# boxes = scale_boxes(boxes.tensor, scale)


def pad_masks(masks, padding):
    """
    Args:
        masks (tensor): A tensor of shape (B, M, M) representing B masks.
        padding (int): Number of cells to pad on all sides.

    Returns:
        The padded masks and the scale factor of the padding size / original size.
    """
    B = masks.shape[0]
    M = masks.shape[-1]
    pad2 = 2 * padding
    scale = float(M + pad2) / M
    padded_masks = masks.new_zeros((B, M + pad2, M + pad2))
    padded_masks[:, padding:-padding, padding:-padding] = masks
    return padded_masks, scale


def scale_boxes(boxes, scale):
    """
    Args:
        boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4
            coords representing the corners x0, y0, x1, y1.
        scale (float): The box scaling factor.

    Returns:
        Scaled boxes.
    """
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half *= scale
    h_half *= scale

    scaled_boxes = torch.zeros_like(boxes)
    scaled_boxes[:, 0] = x_c - w_half
    scaled_boxes[:, 2] = x_c + w_half
    scaled_boxes[:, 1] = y_c - h_half
    scaled_boxes[:, 3] = y_c + h_half
    return scaled_boxes


@torch.jit.script_if_tracing
def _paste_masks_tensor_shape(
    masks: torch.Tensor,
    boxes: torch.Tensor,
    image_shape: Tuple[torch.Tensor, torch.Tensor],
    threshold: float = 0.5,
):
    """
    A wrapper of paste_masks_in_image where image_shape is Tensor.
    During tracing, shapes might be tensors instead of ints. The Tensor->int
    conversion should be scripted rather than traced.
    """
    return paste_masks_in_image(masks, boxes, (int(image_shape[0]), int(image_shape[1])), threshold)
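A minimal usage sketch (not part of the diff) of how the padding helpers above are meant to be combined, following the comment before `pad_masks`; the mask and box tensors here are hypothetical placeholders.

import torch

# Hypothetical inputs: 2 instance masks of 28x28 and their boxes (x0, y0, x1, y1).
masks = torch.rand(2, 1, 28, 28)
boxes = torch.tensor([[10.0, 12.0, 60.0, 70.0], [5.0, 5.0, 40.0, 33.0]])

# Pad each mask by one cell of zeros so bilinear resampling near the mask border
# interpolates toward zero instead of clamping to the boundary value.
padded_masks, scale = pad_masks(masks[:, 0, :, :], 1)   # shape (2, 30, 30), scale = 30 / 28
# Enlarge the boxes by the same relative amount so the zero border maps just
# outside the original box extent.
scaled_boxes = scale_boxes(boxes, scale)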
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/nms.py
ADDED
@@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.

import torch
from torchvision.ops import boxes as box_ops
from torchvision.ops import nms  # noqa . for compatibility


def batched_nms(
    boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
):
    """
    Same as torchvision.ops.boxes.batched_nms, but with float().
    """
    assert boxes.shape[-1] == 4
    # Note: Torchvision already has a strategy (https://github.com/pytorch/vision/issues/1311)
    # to decide whether to use coordinate trick or for loop to implement batched_nms. So we
    # just call it directly.
    # Fp16 does not have enough range for batched NMS, so adding float().
    return box_ops.batched_nms(boxes.float(), scores, idxs, iou_threshold)


# Note: this function (nms_rotated) might be moved into
# torchvision/ops/boxes.py in the future
def nms_rotated(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float):
    """
    Performs non-maximum suppression (NMS) on the rotated boxes according
    to their intersection-over-union (IoU).

    Rotated NMS iteratively removes lower scoring rotated boxes which have an
    IoU greater than iou_threshold with another (higher scoring) rotated box.

    Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as
    RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they
    can be representing completely different objects in certain tasks, e.g., OCR.

    As for the question of whether rotated-NMS should treat them as faraway boxes
    even though their IOU is 1, it depends on the application and/or ground truth annotation.

    As an extreme example, consider a single character v and the square box around it.

    If the angle is 0 degree, the object (text) would be read as 'v';

    If the angle is 90 degrees, the object (text) would become '>';

    If the angle is 180 degrees, the object (text) would become '^';

    If the angle is 270/-90 degrees, the object (text) would become '<'

    All of these cases have IoU of 1 to each other, and rotated NMS that only
    uses IoU as criterion would only keep one of them with the highest score -
    which, practically, still makes sense in most cases because typically
    only one of these orientations is the correct one. Also, it does not matter
    as much if the box is only used to classify the object (instead of transcribing
    it with a sequential OCR recognition model) later.

    On the other hand, when we use IoU to filter proposals that are close to the
    ground truth during training, we should definitely take the angle into account if
    we know the ground truth is labeled with the strictly correct orientation (as in,
    upside-down words are annotated with -180 degrees even though they can be covered
    with a 0/90/-90 degree box, etc.)

    The way the original dataset is annotated also matters. For example, if the dataset
    is a 4-point polygon dataset that does not enforce ordering of vertices/orientation,
    we can estimate a minimum rotated bounding box to this polygon, but there's no way
    we can tell the correct angle with 100% confidence (as shown above, there could be 4 different
    rotated boxes, with angles differed by 90 degrees to each other, covering the exactly
    same region). In that case we have to just use IoU to determine the box
    proximity (as many detection benchmarks (even for text) do) unless there're other
    assumptions we can make (like width is always larger than height, or the object is not
    rotated by more than 90 degrees CCW/CW, etc.)

    In summary, not considering angles in rotated NMS seems to be a good option for now,
    but we should be aware of its implications.

    Args:
        boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in
            (x_center, y_center, width, height, angle_degrees) format.
        scores (Tensor[N]): Scores for each one of the rotated boxes
        iou_threshold (float): Discards all overlapping rotated boxes with IoU < iou_threshold

    Returns:
        keep (Tensor): int64 tensor with the indices of the elements that have been kept
            by Rotated NMS, sorted in decreasing order of scores
    """
    return torch.ops.detectron2.nms_rotated(boxes, scores, iou_threshold)


# Note: this function (batched_nms_rotated) might be moved into
# torchvision/ops/boxes.py in the future


@torch.jit.script_if_tracing
def batched_nms_rotated(
    boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
):
    """
    Performs non-maximum suppression in a batched fashion.

    Each index value corresponds to a category, and NMS
    will not be applied between elements of different categories.

    Args:
        boxes (Tensor[N, 5]):
            boxes where NMS will be performed. They
            are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
        scores (Tensor[N]):
            scores for each one of the boxes
        idxs (Tensor[N]):
            indices of the categories for each one of the boxes.
        iou_threshold (float):
            discards all overlapping boxes
            with IoU < iou_threshold

    Returns:
        Tensor:
            int64 tensor with the indices of the elements that have been kept
            by NMS, sorted in decreasing order of scores
    """
    assert boxes.shape[-1] == 5

    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
    boxes = boxes.float()  # fp16 does not have enough range for batched NMS
    # Strategy: in order to perform NMS independently per class,
    # we add an offset to all the boxes. The offset is dependent
    # only on the class idx, and is large enough so that boxes
    # from different classes do not overlap

    # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate,
    # which won't handle negative coordinates correctly.
    # Here by using min_coordinate we can make sure the negative coordinates are
    # correctly handled.
    max_coordinate = (
        torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2
    ).max()
    min_coordinate = (
        torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2
    ).min()
    offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1)
    boxes_for_nms = boxes.clone()  # avoid modifying the original values in boxes
    boxes_for_nms[:, :2] += offsets[:, None]
    keep = nms_rotated(boxes_for_nms, scores, iou_threshold)
    return keep
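A short standalone sketch of the per-class offset strategy described in the comments of batched_nms_rotated, illustrated here with axis-aligned boxes and torchvision's plain nms so it runs without the compiled detectron2 op; the boxes and scores are made up for illustration.

import torch
from torchvision.ops import nms

# Toy boxes in (x0, y0, x1, y1) format with two classes. The two class-0 boxes overlap
# heavily; the class-1 box overlaps them too but must survive because NMS is per-class.
boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0], [0.0, 0.0, 10.0, 10.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
idxs = torch.tensor([0, 0, 1])

# Shift every box by a per-class offset larger than the coordinate range, so boxes of
# different classes can never overlap, then run a single NMS pass over all of them.
max_coord = boxes.max()
offsets = idxs.to(boxes) * (max_coord + 1)
keep = nms(boxes + offsets[:, None], scores, iou_threshold=0.5)
print(keep)  # tensor([0, 2]) -- one box kept per class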
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align.py
ADDED
@@ -0,0 +1,74 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from torch import nn
from torchvision.ops import roi_align


# NOTE: torchvision's RoIAlign has a different default aligned=False
class ROIAlign(nn.Module):
    def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True):
        """
        Args:
            output_size (tuple): h, w
            spatial_scale (float): scale the input boxes by this number
            sampling_ratio (int): number of input samples to take for each output
                sample. 0 to take samples densely.
            aligned (bool): if False, use the legacy implementation in
                Detectron. If True, align the results more perfectly.

        Note:
            The meaning of aligned=True:

            Given a continuous coordinate c, its two neighboring pixel indices (in our
            pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
            c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
            from the underlying signal at continuous coordinates 0.5 and 1.5). But the original
            roi_align (aligned=False) does not subtract the 0.5 when computing neighboring
            pixel indices and therefore it uses pixels with a slightly incorrect alignment
            (relative to our pixel model) when performing bilinear interpolation.

            With `aligned=True`,
            we first appropriately scale the ROI and then shift it by -0.5
            prior to calling roi_align. This produces the correct neighbors; see
            detectron2/tests/test_roi_align.py for verification.

            The difference does not make a difference to the model's performance if
            ROIAlign is used together with conv layers.
        """
        super().__init__()
        self.output_size = output_size
        self.spatial_scale = spatial_scale
        self.sampling_ratio = sampling_ratio
        self.aligned = aligned

        from torchvision import __version__

        version = tuple(int(x) for x in __version__.split(".")[:2])
        # https://github.com/pytorch/vision/pull/2438
        assert version >= (0, 7), "Require torchvision >= 0.7"

    def forward(self, input, rois):
        """
        Args:
            input: NCHW images
            rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
        """
        assert rois.dim() == 2 and rois.size(1) == 5
        if input.is_quantized:
            input = input.dequantize()
        return roi_align(
            input,
            rois.to(dtype=input.dtype),
            self.output_size,
            self.spatial_scale,
            self.sampling_ratio,
            self.aligned,
        )

    def __repr__(self):
        tmpstr = self.__class__.__name__ + "("
        tmpstr += "output_size=" + str(self.output_size)
        tmpstr += ", spatial_scale=" + str(self.spatial_scale)
        tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
        tmpstr += ", aligned=" + str(self.aligned)
        tmpstr += ")"
        return tmpstr
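A small usage sketch of the ROIAlign wrapper above with dummy tensors; the 14x14 output size, 1/16 spatial scale, and box coordinates are illustrative only.

import torch

pooler = ROIAlign(output_size=(14, 14), spatial_scale=1.0 / 16, sampling_ratio=0, aligned=True)

features = torch.rand(2, 256, 32, 32)   # NCHW feature map, e.g. a stride-16 level
rois = torch.tensor([
    [0.0, 16.0, 16.0, 160.0, 160.0],    # batch index 0, box in image coordinates (xyxy)
    [1.0, 0.0, 0.0, 320.0, 320.0],      # batch index 1
])
crops = pooler(features, rois)
print(crops.shape)                      # torch.Size([2, 256, 14, 14])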
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align_rotated.py
ADDED
@@ -0,0 +1,100 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import torch
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair


class _ROIAlignRotated(Function):
    @staticmethod
    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
        ctx.save_for_backward(roi)
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = spatial_scale
        ctx.sampling_ratio = sampling_ratio
        ctx.input_shape = input.size()
        output = torch.ops.detectron2.roi_align_rotated_forward(
            input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
        )
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        (rois,) = ctx.saved_tensors
        output_size = ctx.output_size
        spatial_scale = ctx.spatial_scale
        sampling_ratio = ctx.sampling_ratio
        bs, ch, h, w = ctx.input_shape
        grad_input = torch.ops.detectron2.roi_align_rotated_backward(
            grad_output,
            rois,
            spatial_scale,
            output_size[0],
            output_size[1],
            bs,
            ch,
            h,
            w,
            sampling_ratio,
        )
        return grad_input, None, None, None, None, None


roi_align_rotated = _ROIAlignRotated.apply


class ROIAlignRotated(nn.Module):
    def __init__(self, output_size, spatial_scale, sampling_ratio):
        """
        Args:
            output_size (tuple): h, w
            spatial_scale (float): scale the input boxes by this number
            sampling_ratio (int): number of input samples to take for each output
                sample. 0 to take samples densely.

        Note:
            ROIAlignRotated supports continuous coordinates by default:
            Given a continuous coordinate c, its two neighboring pixel indices (in our
            pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
            c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
            from the underlying signal at continuous coordinates 0.5 and 1.5).
        """
        super(ROIAlignRotated, self).__init__()
        self.output_size = output_size
        self.spatial_scale = spatial_scale
        self.sampling_ratio = sampling_ratio

    def forward(self, input, rois):
        """
        Args:
            input: NCHW images
            rois: Bx6 boxes. First column is the index into N.
                The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees).
        """
        assert rois.dim() == 2 and rois.size(1) == 6
        orig_dtype = input.dtype
        if orig_dtype == torch.float16:
            input = input.float()
            rois = rois.float()
        output_size = _pair(self.output_size)

        # Scripting for Autograd is currently unsupported.
        # This is a quick fix without having to rewrite code on the C++ side
        if torch.jit.is_scripting() or torch.jit.is_tracing():
            return torch.ops.detectron2.roi_align_rotated_forward(
                input, rois, self.spatial_scale, output_size[0], output_size[1], self.sampling_ratio
            ).to(dtype=orig_dtype)

        return roi_align_rotated(
            input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
        ).to(dtype=orig_dtype)

    def __repr__(self):
        tmpstr = self.__class__.__name__ + "("
        tmpstr += "output_size=" + str(self.output_size)
        tmpstr += ", spatial_scale=" + str(self.spatial_scale)
        tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
        tmpstr += ")"
        return tmpstr
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/rotated_boxes.py
ADDED
@@ -0,0 +1,21 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from __future__ import absolute_import, division, print_function, unicode_literals
import torch


def pairwise_iou_rotated(boxes1, boxes2):
    """
    Return intersection-over-union (Jaccard index) of boxes.

    Both sets of boxes are expected to be in
    (x_center, y_center, width, height, angle) format.

    Arguments:
        boxes1 (Tensor[N, 5])
        boxes2 (Tensor[M, 5])

    Returns:
        iou (Tensor[N, M]): the NxM matrix containing the pairwise
            IoU values for every element in boxes1 and boxes2
    """
    return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2)
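A hedged sketch of what pairwise_iou_rotated returns for a toy input; it only runs if the compiled detectron2 C++/CUDA ops (torch.ops.detectron2) are loaded, and the boxes below are hypothetical.

import torch

# Two identical 4x4 squares rotated by 0 and 90 degrees cover the same region,
# so their IoU is 1; a far-away box gives 0. Boxes are (x_ctr, y_ctr, w, h, angle).
boxes1 = torch.tensor([[5.0, 5.0, 4.0, 4.0, 0.0]])
boxes2 = torch.tensor([[5.0, 5.0, 4.0, 4.0, 90.0], [50.0, 50.0, 4.0, 4.0, 0.0]])
iou = pairwise_iou_rotated(boxes1, boxes2)   # shape (1, 2)
print(iou)                                   # approximately [[1.0, 0.0]]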
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/shape_spec.py
ADDED
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ShapeSpec:
    """
    A simple structure that contains basic shape specification about a tensor.
    It is often used as the auxiliary inputs/outputs of models,
    to complement the lack of shape inference ability among pytorch modules.
    """

    channels: Optional[int] = None
    height: Optional[int] = None
    width: Optional[int] = None
    stride: Optional[int] = None
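A tiny usage sketch of the ShapeSpec dataclass above; the channel count and stride are illustrative values.

from dataclasses import asdict

# Describe a stride-4 feature map with 256 channels; unknown spatial size stays None.
spec = ShapeSpec(channels=256, stride=4)
print(spec.channels, spec.stride)   # 256 4
print(asdict(spec))                 # {'channels': 256, 'height': None, 'width': None, 'stride': 4}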
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/wrappers.py
ADDED
@@ -0,0 +1,162 @@
# Copyright (c) Facebook, Inc. and its affiliates.
"""
Wrappers around some nn functions, mainly to support empty tensors.

Ideally, add support directly in PyTorch to empty tensors in those functions.

These can be removed once https://github.com/pytorch/pytorch/issues/12013
is implemented
"""

import warnings
from typing import List, Optional
import torch
from torch.nn import functional as F

from annotator.oneformer.detectron2.utils.env import TORCH_VERSION


def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor:
    """
    Turn a list of integer scalars or integer Tensor scalars into a vector,
    in a way that's both traceable and scriptable.

    In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs.
    In scripting or eager, `x` should be a list of int.
    """
    if torch.jit.is_scripting():
        return torch.as_tensor(x, device=device)
    if torch.jit.is_tracing():
        assert all(
            [isinstance(t, torch.Tensor) for t in x]
        ), "Shape should be tensor during tracing!"
        # as_tensor should not be used in tracing because it records a constant
        ret = torch.stack(x)
        if ret.device != device:  # avoid recording a hard-coded device if not necessary
            ret = ret.to(device=device)
        return ret
    return torch.as_tensor(x, device=device)


def check_if_dynamo_compiling():
    if TORCH_VERSION >= (1, 14):
        from torch._dynamo import is_compiling

        return is_compiling()
    else:
        return False


def cat(tensors: List[torch.Tensor], dim: int = 0):
    """
    Efficient version of torch.cat that avoids a copy if there is only a single element in a list
    """
    assert isinstance(tensors, (list, tuple))
    if len(tensors) == 1:
        return tensors[0]
    return torch.cat(tensors, dim)


def empty_input_loss_func_wrapper(loss_func):
    def wrapped_loss_func(input, target, *, reduction="mean", **kwargs):
        """
        Same as `loss_func`, but returns 0 (instead of nan) for empty inputs.
        """
        if target.numel() == 0 and reduction == "mean":
            return input.sum() * 0.0  # connect the gradient
        return loss_func(input, target, reduction=reduction, **kwargs)

    return wrapped_loss_func


cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy)


class _NewEmptyTensorOp(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, new_shape):
        ctx.shape = x.shape
        return x.new_empty(new_shape)

    @staticmethod
    def backward(ctx, grad):
        shape = ctx.shape
        return _NewEmptyTensorOp.apply(grad, shape), None


class Conv2d(torch.nn.Conv2d):
    """
    A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
    """

    def __init__(self, *args, **kwargs):
        """
        Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:

        Args:
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation function

        It assumes that norm layer is used before activation.
        """
        norm = kwargs.pop("norm", None)
        activation = kwargs.pop("activation", None)
        super().__init__(*args, **kwargs)

        self.norm = norm
        self.activation = activation

    def forward(self, x):
        # torchscript does not support SyncBatchNorm yet
        # https://github.com/pytorch/pytorch/issues/40507
        # and we skip these codes in torchscript since:
        # 1. currently we only support torchscript in evaluation mode
        # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
        # later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
        if not torch.jit.is_scripting():
            # Dynamo doesn't support context managers yet
            is_dynamo_compiling = check_if_dynamo_compiling()
            if not is_dynamo_compiling:
                with warnings.catch_warnings(record=True):
                    if x.numel() == 0 and self.training:
                        # https://github.com/pytorch/pytorch/issues/12013
                        assert not isinstance(
                            self.norm, torch.nn.SyncBatchNorm
                        ), "SyncBatchNorm does not support empty inputs!"

        x = F.conv2d(
            x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
        )
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x


ConvTranspose2d = torch.nn.ConvTranspose2d
BatchNorm2d = torch.nn.BatchNorm2d
interpolate = F.interpolate
Linear = torch.nn.Linear


def nonzero_tuple(x):
    """
    A 'as_tuple=True' version of torch.nonzero to support torchscript.
    because of https://github.com/pytorch/pytorch/issues/38718
    """
    if torch.jit.is_scripting():
        if x.dim() == 0:
            return x.unsqueeze(0).nonzero().unbind(1)
        return x.nonzero().unbind(1)
    else:
        return x.nonzero(as_tuple=True)


@torch.jit.script_if_tracing
def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor:
    """
    Tracing friendly way to cast tensor to another tensor's device. Device will be treated
    as constant during tracing, scripting the casting process as whole can workaround this issue.
    """
    return src.to(dst.device)
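A short sketch of why the empty-input wrapper above exists, using the `cross_entropy` defined in this file; the tensor shapes are hypothetical (e.g. a batch with zero proposals and 80 classes).

import torch
import torch.nn.functional as F

logits = torch.zeros(0, 80, requires_grad=True)   # no proposals in this batch
labels = torch.zeros(0, dtype=torch.int64)

# Plain F.cross_entropy with reduction="mean" averages over zero elements,
# which typically yields nan; the wrapped version returns 0 while keeping the
# graph connected through input.sum() * 0.0.
print(F.cross_entropy(logits, labels))   # nan for an empty batch
print(cross_entropy(logits, labels))     # tensor(0., grad_fn=...)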
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/__init__.py
ADDED
@@ -0,0 +1,10 @@
# Copyright (c) Facebook, Inc. and its affiliates.
"""
Model Zoo API for Detectron2: a collection of functions to create common model architectures
listed in `MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md>`_,
and optionally load their pre-trained weights.
"""

from .model_zoo import get, get_config_file, get_checkpoint_url, get_config

__all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"]
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/model_zoo.py
ADDED
@@ -0,0 +1,213 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import os
from typing import Optional
import pkg_resources
import torch

from annotator.oneformer.detectron2.checkpoint import DetectionCheckpointer
from annotator.oneformer.detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
from annotator.oneformer.detectron2.modeling import build_model


class _ModelZooUrls(object):
    """
    Mapping from names to officially released Detectron2 pre-trained models.
    """

    S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"

    # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl
    CONFIG_PATH_TO_URL_SUFFIX = {
        # COCO Detection with Faster R-CNN
        "COCO-Detection/faster_rcnn_R_50_C4_1x": "137257644/model_final_721ade.pkl",
        "COCO-Detection/faster_rcnn_R_50_DC5_1x": "137847829/model_final_51d356.pkl",
        "COCO-Detection/faster_rcnn_R_50_FPN_1x": "137257794/model_final_b275ba.pkl",
        "COCO-Detection/faster_rcnn_R_50_C4_3x": "137849393/model_final_f97cb7.pkl",
        "COCO-Detection/faster_rcnn_R_50_DC5_3x": "137849425/model_final_68d202.pkl",
        "COCO-Detection/faster_rcnn_R_50_FPN_3x": "137849458/model_final_280758.pkl",
        "COCO-Detection/faster_rcnn_R_101_C4_3x": "138204752/model_final_298dad.pkl",
        "COCO-Detection/faster_rcnn_R_101_DC5_3x": "138204841/model_final_3e0943.pkl",
        "COCO-Detection/faster_rcnn_R_101_FPN_3x": "137851257/model_final_f6e8b1.pkl",
        "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x": "139173657/model_final_68b088.pkl",
        # COCO Detection with RetinaNet
        "COCO-Detection/retinanet_R_50_FPN_1x": "190397773/model_final_bfca0b.pkl",
        "COCO-Detection/retinanet_R_50_FPN_3x": "190397829/model_final_5bd44e.pkl",
        "COCO-Detection/retinanet_R_101_FPN_3x": "190397697/model_final_971ab9.pkl",
        # COCO Detection with RPN and Fast R-CNN
        "COCO-Detection/rpn_R_50_C4_1x": "137258005/model_final_450694.pkl",
        "COCO-Detection/rpn_R_50_FPN_1x": "137258492/model_final_02ce48.pkl",
        "COCO-Detection/fast_rcnn_R_50_FPN_1x": "137635226/model_final_e5f7ce.pkl",
        # COCO Instance Segmentation Baselines with Mask R-CNN
        "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x": "137259246/model_final_9243eb.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x": "137260150/model_final_4f86c3.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "137260431/model_final_a54504.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x": "137849525/model_final_4ce675.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x": "137849551/model_final_84107b.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x": "137849600/model_final_f10217.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x": "138363239/model_final_a2914c.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x": "138363294/model_final_0464b7.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x": "138205316/model_final_a3ec72.pkl",
        "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x": "139653917/model_final_2d9806.pkl",  # noqa
        # New baselines using Large-Scale Jitter and Longer Training Schedule
        "new_baselines/mask_rcnn_R_50_FPN_100ep_LSJ": "42047764/model_final_bb69de.pkl",
        "new_baselines/mask_rcnn_R_50_FPN_200ep_LSJ": "42047638/model_final_89a8d3.pkl",
        "new_baselines/mask_rcnn_R_50_FPN_400ep_LSJ": "42019571/model_final_14d201.pkl",
        "new_baselines/mask_rcnn_R_101_FPN_100ep_LSJ": "42025812/model_final_4f7b58.pkl",
        "new_baselines/mask_rcnn_R_101_FPN_200ep_LSJ": "42131867/model_final_0bb7ae.pkl",
        "new_baselines/mask_rcnn_R_101_FPN_400ep_LSJ": "42073830/model_final_f96b26.pkl",
        "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_100ep_LSJ": "42047771/model_final_b7fbab.pkl",  # noqa
        "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_200ep_LSJ": "42132721/model_final_5d87c1.pkl",  # noqa
        "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_400ep_LSJ": "42025447/model_final_f1362d.pkl",  # noqa
        "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_100ep_LSJ": "42047784/model_final_6ba57e.pkl",  # noqa
        "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_200ep_LSJ": "42047642/model_final_27b9c1.pkl",  # noqa
        "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ": "42045954/model_final_ef3a80.pkl",  # noqa
        # COCO Person Keypoint Detection Baselines with Keypoint R-CNN
        "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x": "137261548/model_final_04e291.pkl",
        "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x": "137849621/model_final_a6e10b.pkl",
        "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x": "138363331/model_final_997cc7.pkl",
        "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x": "139686956/model_final_5ad38f.pkl",
        # COCO Panoptic Segmentation Baselines with Panoptic FPN
        "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x": "139514544/model_final_dbfeb4.pkl",
        "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x": "139514569/model_final_c10459.pkl",
        "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x": "139514519/model_final_cafdb1.pkl",
        # LVIS Instance Segmentation Baselines with Mask R-CNN
        "LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "144219072/model_final_571f7c.pkl",  # noqa
        "LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x": "144219035/model_final_824ab5.pkl",  # noqa
        "LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x": "144219108/model_final_5e3439.pkl",  # noqa
        # Cityscapes & Pascal VOC Baselines
        "Cityscapes/mask_rcnn_R_50_FPN": "142423278/model_final_af9cf5.pkl",
        "PascalVOC-Detection/faster_rcnn_R_50_C4": "142202221/model_final_b1acc2.pkl",
        # Other Settings
        "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5": "138602867/model_final_65c703.pkl",
        "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5": "144998336/model_final_821d0b.pkl",
        "Misc/cascade_mask_rcnn_R_50_FPN_1x": "138602847/model_final_e9d89b.pkl",
        "Misc/cascade_mask_rcnn_R_50_FPN_3x": "144998488/model_final_480dd8.pkl",
        "Misc/mask_rcnn_R_50_FPN_3x_syncbn": "169527823/model_final_3b3c51.pkl",
        "Misc/mask_rcnn_R_50_FPN_3x_gn": "138602888/model_final_dc5d9e.pkl",
        "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn": "138602908/model_final_01ca85.pkl",
        "Misc/scratch_mask_rcnn_R_50_FPN_9x_gn": "183808979/model_final_da7b4c.pkl",
        "Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn": "184226666/model_final_5ce33e.pkl",
        "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x": "139797668/model_final_be35db.pkl",
        "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv": "18131413/model_0039999_e76410.pkl",  # noqa
        # D1 Comparisons
        "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x": "137781054/model_final_7ab50c.pkl",  # noqa
        "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x": "137781281/model_final_62ca52.pkl",  # noqa
        "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x": "137781195/model_final_cce136.pkl",
    }

    @staticmethod
    def query(config_path: str) -> Optional[str]:
        """
        Args:
            config_path: relative config filename
        """
        name = config_path.replace(".yaml", "").replace(".py", "")
        if name in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX:
            suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[name]
            return _ModelZooUrls.S3_PREFIX + name + "/" + suffix
        return None


def get_checkpoint_url(config_path):
    """
    Returns the URL to the model trained using the given config

    Args:
        config_path (str): config file name relative to detectron2's "configs/"
            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"

    Returns:
        str: a URL to the model
    """
    url = _ModelZooUrls.query(config_path)
    if url is None:
        raise RuntimeError("Pretrained model for {} is not available!".format(config_path))
    return url


def get_config_file(config_path):
    """
    Returns path to a builtin config file.

    Args:
        config_path (str): config file name relative to detectron2's "configs/"
            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"

    Returns:
        str: the real path to the config file.
    """
    cfg_file = pkg_resources.resource_filename(
        "detectron2.model_zoo", os.path.join("configs", config_path)
    )
    if not os.path.exists(cfg_file):
        raise RuntimeError("{} not available in Model Zoo!".format(config_path))
    return cfg_file


def get_config(config_path, trained: bool = False):
    """
    Returns a config object for a model in model zoo.

    Args:
        config_path (str): config file name relative to detectron2's "configs/"
            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
        trained (bool): If True, will set ``MODEL.WEIGHTS`` to trained model zoo weights.
            If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used
            instead; this will typically (though not always) initialize a subset of weights using
            an ImageNet pre-trained model, while randomly initializing the other weights.

    Returns:
        CfgNode or omegaconf.DictConfig: a config object
    """
    cfg_file = get_config_file(config_path)
    if cfg_file.endswith(".yaml"):
        cfg = get_cfg()
        cfg.merge_from_file(cfg_file)
        if trained:
            cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path)
        return cfg
    elif cfg_file.endswith(".py"):
        cfg = LazyConfig.load(cfg_file)
        if trained:
            url = get_checkpoint_url(config_path)
            if "train" in cfg and "init_checkpoint" in cfg.train:
                cfg.train.init_checkpoint = url
            else:
                raise NotImplementedError
        return cfg


def get(config_path, trained: bool = False, device: Optional[str] = None):
    """
    Get a model specified by relative path under Detectron2's official ``configs/`` directory.

    Args:
        config_path (str): config file name relative to detectron2's "configs/"
            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
        trained (bool): see :func:`get_config`.
        device (str or None): overwrite the device in config, if given.

    Returns:
        nn.Module: a detectron2 model. Will be in training mode.

    Example:
    ::
        from annotator.oneformer.detectron2 import model_zoo
        model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
    """
    cfg = get_config(config_path, trained)
    if device is None and not torch.cuda.is_available():
        device = "cpu"
    if device is not None and isinstance(cfg, CfgNode):
        cfg.MODEL.DEVICE = device

    if isinstance(cfg, CfgNode):
        model = build_model(cfg)
        DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    else:
        model = instantiate(cfg.model)
        if device is not None:
            model = model.to(device)
        if "train" in cfg and "init_checkpoint" in cfg.train:
            DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
    return model
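A usage sketch of the model zoo API above, following the example in the `get` docstring. The import path matches this vendored package layout; note that `get_config_file` resolves configs from the installed `detectron2.model_zoo` package via pkg_resources, so this only works when those bundled config files are present, and `trained=True` downloads weights on first use.

from annotator.oneformer.detectron2 import model_zoo

# Resolve the checkpoint URL and config without building the model.
url = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml")
cfg = model_zoo.get_config("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)

# Or build the model directly and switch it to inference mode.
model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
model.eval()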
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/__init__.py
ADDED
@@ -0,0 +1,64 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from annotator.oneformer.detectron2.layers import ShapeSpec

from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY
from .backbone import (
    BACKBONE_REGISTRY,
    FPN,
    Backbone,
    ResNet,
    ResNetBlockBase,
    build_backbone,
    build_resnet_backbone,
    make_stage,
    ViT,
    SimpleFeaturePyramid,
    get_vit_lr_decay_rate,
    MViT,
    SwinTransformer,
)
from .meta_arch import (
    META_ARCH_REGISTRY,
    SEM_SEG_HEADS_REGISTRY,
    GeneralizedRCNN,
    PanopticFPN,
    ProposalNetwork,
    RetinaNet,
    SemanticSegmentor,
    build_model,
    build_sem_seg_head,
    FCOS,
)
from .postprocessing import detector_postprocess
from .proposal_generator import (
    PROPOSAL_GENERATOR_REGISTRY,
    build_proposal_generator,
    RPN_HEAD_REGISTRY,
    build_rpn_head,
)
from .roi_heads import (
    ROI_BOX_HEAD_REGISTRY,
    ROI_HEADS_REGISTRY,
    ROI_KEYPOINT_HEAD_REGISTRY,
    ROI_MASK_HEAD_REGISTRY,
    ROIHeads,
    StandardROIHeads,
    BaseMaskRCNNHead,
    BaseKeypointRCNNHead,
    FastRCNNOutputLayers,
    build_box_head,
    build_keypoint_head,
    build_mask_head,
    build_roi_heads,
)
from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
from .mmdet_wrapper import MMDetBackbone, MMDetDetector

_EXCLUDE = {"ShapeSpec"}
__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]


from annotator.oneformer.detectron2.utils.env import fixup_module_metadata

fixup_module_metadata(__name__, globals(), __all__)
del fixup_module_metadata
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/anchor_generator.py
ADDED
@@ -0,0 +1,386 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import collections
import math
from typing import List
import torch
from torch import nn

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.layers import ShapeSpec, move_device_like
from annotator.oneformer.detectron2.structures import Boxes, RotatedBoxes
from annotator.oneformer.detectron2.utils.registry import Registry

ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR")
ANCHOR_GENERATOR_REGISTRY.__doc__ = """
Registry for modules that create object detection anchors for feature maps.

The registered object will be called with `obj(cfg, input_shape)`.
"""


class BufferList(nn.Module):
    """
    Similar to nn.ParameterList, but for buffers
    """

    def __init__(self, buffers):
        super().__init__()
        for i, buffer in enumerate(buffers):
            # Use non-persistent buffer so the values are not saved in checkpoint
            self.register_buffer(str(i), buffer, persistent=False)

    def __len__(self):
        return len(self._buffers)

    def __iter__(self):
        return iter(self._buffers.values())


def _create_grid_offsets(
    size: List[int], stride: int, offset: float, target_device_tensor: torch.Tensor
):
    grid_height, grid_width = size
    shifts_x = move_device_like(
        torch.arange(offset * stride, grid_width * stride, step=stride, dtype=torch.float32),
        target_device_tensor,
    )
    shifts_y = move_device_like(
        torch.arange(offset * stride, grid_height * stride, step=stride, dtype=torch.float32),
        target_device_tensor,
    )

    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
    shift_x = shift_x.reshape(-1)
    shift_y = shift_y.reshape(-1)
    return shift_x, shift_y


def _broadcast_params(params, num_features, name):
    """
    If one size (or aspect ratio) is specified and there are multiple feature
    maps, we "broadcast" anchors of that single size (or aspect ratio)
    over all feature maps.

    If params is list[float], or list[list[float]] with len(params) == 1, repeat
    it num_features times.

    Returns:
        list[list[float]]: param for each feature
    """
    assert isinstance(
        params, collections.abc.Sequence
    ), f"{name} in anchor generator has to be a list! Got {params}."
    assert len(params), f"{name} in anchor generator cannot be empty!"
    if not isinstance(params[0], collections.abc.Sequence):  # params is list[float]
        return [params] * num_features
    if len(params) == 1:
        return list(params) * num_features
    assert len(params) == num_features, (
        f"Got {name} of length {len(params)} in anchor generator, "
        f"but the number of input features is {num_features}!"
    )
    return params


@ANCHOR_GENERATOR_REGISTRY.register()
class DefaultAnchorGenerator(nn.Module):
    """
    Compute anchors in the standard ways described in
    "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks".
    """

    box_dim: torch.jit.Final[int] = 4
    """
    the dimension of each anchor box.
    """

    @configurable
    def __init__(self, *, sizes, aspect_ratios, strides, offset=0.5):
        """
        This interface is experimental.

        Args:
            sizes (list[list[float]] or list[float]):
                If ``sizes`` is list[list[float]], ``sizes[i]`` is the list of anchor sizes
                (i.e. sqrt of anchor area) to use for the i-th feature map.
                If ``sizes`` is list[float], ``sizes`` is used for all feature maps.
                Anchor sizes are given in absolute lengths in units of
                the input image; they do not dynamically scale if the input image size changes.
            aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
                (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
            strides (list[int]): stride of each input feature.
            offset (float): Relative offset between the center of the first anchor and the top-left
                corner of the image. Value has to be in [0, 1).
                Recommend to use 0.5, which means half stride.
        """
        super().__init__()

        self.strides = strides
        self.num_features = len(self.strides)
        sizes = _broadcast_params(sizes, self.num_features, "sizes")
        aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
        self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios)

        self.offset = offset
        assert 0.0 <= self.offset < 1.0, self.offset

    @classmethod
    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
        return {
            "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
            "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
            "strides": [x.stride for x in input_shape],
            "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
        }

    def _calculate_anchors(self, sizes, aspect_ratios):
        cell_anchors = [
            self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios)
        ]
        return BufferList(cell_anchors)

    @property
    @torch.jit.unused
    def num_cell_anchors(self):
        """
        Alias of `num_anchors`.
        """
        return self.num_anchors

    @property
    @torch.jit.unused
    def num_anchors(self):
        """
        Returns:
            list[int]: Each int is the number of anchors at every pixel
                location, on that feature map.
                For example, if at every pixel we use anchors of 3 aspect
                ratios and 5 sizes, the number of anchors is 15.
                (See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config)

            In standard RPN models, `num_anchors` on every feature map is the same.
        """
        return [len(cell_anchors) for cell_anchors in self.cell_anchors]

    def _grid_anchors(self, grid_sizes: List[List[int]]):
        """
        Returns:
            list[Tensor]: #featuremap tensors, each is (#locations x #cell_anchors) x 4
        """
        anchors = []
        # buffers() not supported by torchscript. use named_buffers() instead
        buffers: List[torch.Tensor] = [x[1] for x in self.cell_anchors.named_buffers()]
        for size, stride, base_anchors in zip(grid_sizes, self.strides, buffers):
            shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors)
            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)

            anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))

        return anchors

    def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
        """
        Generate a tensor storing canonical anchor boxes, which are all anchor
        boxes of different sizes and aspect_ratios centered at (0, 0).
        We can later build the set of anchors for a full feature map by
        shifting and tiling these tensors (see `meth:_grid_anchors`).

        Args:
            sizes (tuple[float]):
            aspect_ratios (tuple[float]]):

        Returns:
            Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes
                in XYXY format.
        """

        # This is different from the anchor generator defined in the original Faster R-CNN
        # code or Detectron. They yield the same AP, however the old version defines cell
        # anchors in a less natural way with a shift relative to the feature grid and
        # quantization that results in slightly different sizes for different aspect ratios.
        # See also https://github.com/facebookresearch/Detectron/issues/227

        anchors = []
        for size in sizes:
            area = size**2.0
            for aspect_ratio in aspect_ratios:
                # s * s = w * h
                # a = h / w
                # ... some algebra ...
                # w = sqrt(s * s / a)
                # h = a * w
                w = math.sqrt(area / aspect_ratio)
                h = aspect_ratio * w
                x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
                anchors.append([x0, y0, x1, y1])
        return torch.tensor(anchors)

    def forward(self, features: List[torch.Tensor]):
        """
        Args:
            features (list[Tensor]): list of backbone feature maps on which to generate anchors.

        Returns:
            list[Boxes]: a list of Boxes containing all the anchors for each feature map
                (i.e. the cell anchors repeated over all locations in the feature map).
                The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
                where Hi, Wi are resolution of the feature map divided by anchor stride.
        """
        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
        anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
        return [Boxes(x) for x in anchors_over_all_feature_maps]


@ANCHOR_GENERATOR_REGISTRY.register()
class RotatedAnchorGenerator(nn.Module):
    """
    Compute rotated anchors used by Rotated RPN (RRPN), described in
    "Arbitrary-Oriented Scene Text Detection via Rotation Proposals".
    """

    box_dim: int = 5
    """
    the dimension of each anchor box.
    """

    @configurable
    def __init__(self, *, sizes, aspect_ratios, strides, angles, offset=0.5):
        """
        This interface is experimental.

        Args:
            sizes (list[list[float]] or list[float]):
                If sizes is list[list[float]], sizes[i] is the list of anchor sizes
                (i.e. sqrt of anchor area) to use for the i-th feature map.
                If sizes is list[float], the sizes are used for all feature maps.
                Anchor sizes are given in absolute lengths in units of
                the input image; they do not dynamically scale if the input image size changes.
            aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
                (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
            strides (list[int]): stride of each input feature.
            angles (list[list[float]] or list[float]): list of angles (in degrees CCW)
                to use for anchors. Same "broadcast" rule for `sizes` applies.
            offset (float): Relative offset between the center of the first anchor and the top-left
                corner of the image. Value has to be in [0, 1).
                Recommend to use 0.5, which means half stride.
        """
        super().__init__()

        self.strides = strides
        self.num_features = len(self.strides)
        sizes = _broadcast_params(sizes, self.num_features, "sizes")
        aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
        angles = _broadcast_params(angles, self.num_features, "angles")
        self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles)

        self.offset = offset
        assert 0.0 <= self.offset < 1.0, self.offset

    @classmethod
    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
        return {
            "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
            "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
            "strides": [x.stride for x in input_shape],
            "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
            "angles": cfg.MODEL.ANCHOR_GENERATOR.ANGLES,
        }

    def _calculate_anchors(self, sizes, aspect_ratios, angles):
        cell_anchors = [
            self.generate_cell_anchors(size, aspect_ratio, angle).float()
            for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles)
        ]
        return BufferList(cell_anchors)

    @property
    def num_cell_anchors(self):
        """
        Alias of `num_anchors`.
        """
        return self.num_anchors

    @property
    def num_anchors(self):
        """
        Returns:
            list[int]: Each int is the number of anchors at every pixel
                location, on that feature map.
                For example, if at every pixel we use anchors of 3 aspect
                ratios, 2 sizes and 5 angles, the number of anchors is 30.
                (See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS
                and ANCHOR_GENERATOR.ANGLES in config)

            In standard RRPN models, `num_anchors` on every feature map is the same.
        """
        return [len(cell_anchors) for cell_anchors in self.cell_anchors]

    def _grid_anchors(self, grid_sizes):
        anchors = []
        for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
            shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors)
            zeros = torch.zeros_like(shift_x)
            shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1)

            anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5))

        return anchors

    def generate_cell_anchors(
        self,
        sizes=(32, 64, 128, 256, 512),
        aspect_ratios=(0.5, 1, 2),
        angles=(-90, -60, -30, 0, 30, 60, 90),
    ):
        """
        Generate a tensor storing canonical anchor boxes, which are all anchor
        boxes of different sizes, aspect_ratios, angles centered at (0, 0).
        We can later build the set of anchors for a full feature map by
        shifting and tiling these tensors (see `meth:_grid_anchors`).

        Args:
            sizes (tuple[float]):
            aspect_ratios (tuple[float]]):
            angles (tuple[float]]):

        Returns:
            Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5)
                storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format.
        """
        anchors = []
        for size in sizes:
            area = size**2.0
            for aspect_ratio in aspect_ratios:
                # s * s = w * h
                # a = h / w
                # ... some algebra ...
                # w = sqrt(s * s / a)
                # h = a * w
                w = math.sqrt(area / aspect_ratio)
                h = aspect_ratio * w
                anchors.extend([0, 0, w, h, a] for a in angles)

        return torch.tensor(anchors)

    def forward(self, features):
        """
        Args:
            features (list[Tensor]): list of backbone feature maps on which to generate anchors.

        Returns:
            list[RotatedBoxes]: a list of Boxes containing all the anchors for each feature map
                (i.e. the cell anchors repeated over all locations in the feature map).
                The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
                where Hi, Wi are resolution of the feature map divided by anchor stride.
        """
        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
        anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
        return [RotatedBoxes(x) for x in anchors_over_all_feature_maps]


def build_anchor_generator(cfg, input_shape):
    """
    Build an anchor generator from `cfg.MODEL.ANCHOR_GENERATOR.NAME`.
    """
    anchor_generator = cfg.MODEL.ANCHOR_GENERATOR.NAME
    return ANCHOR_GENERATOR_REGISTRY.get(anchor_generator)(cfg, input_shape)
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/__init__.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
from .build import build_backbone, BACKBONE_REGISTRY # noqa F401 isort:skip
|
3 |
+
|
4 |
+
from .backbone import Backbone
|
5 |
+
from .fpn import FPN
|
6 |
+
from .regnet import RegNet
|
7 |
+
from .resnet import (
|
8 |
+
BasicStem,
|
9 |
+
ResNet,
|
10 |
+
ResNetBlockBase,
|
11 |
+
build_resnet_backbone,
|
12 |
+
make_stage,
|
13 |
+
BottleneckBlock,
|
14 |
+
)
|
15 |
+
from .vit import ViT, SimpleFeaturePyramid, get_vit_lr_decay_rate
|
16 |
+
from .mvit import MViT
|
17 |
+
from .swin import SwinTransformer
|
18 |
+
|
19 |
+
__all__ = [k for k in globals().keys() if not k.startswith("_")]
|
20 |
+
# TODO can expose more resnet blocks after careful consideration
|
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/backbone.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
from abc import ABCMeta, abstractmethod
|
3 |
+
from typing import Dict
|
4 |
+
import torch.nn as nn
|
5 |
+
|
6 |
+
from annotator.oneformer.detectron2.layers import ShapeSpec
|
7 |
+
|
8 |
+
__all__ = ["Backbone"]
|
9 |
+
|
10 |
+
|
11 |
+
class Backbone(nn.Module, metaclass=ABCMeta):
|
12 |
+
"""
|
13 |
+
Abstract base class for network backbones.
|
14 |
+
"""
|
15 |
+
|
16 |
+
def __init__(self):
|
17 |
+
"""
|
18 |
+
The `__init__` method of any subclass can specify its own set of arguments.
|
19 |
+
"""
|
20 |
+
super().__init__()
|
21 |
+
|
22 |
+
@abstractmethod
|
23 |
+
def forward(self):
|
24 |
+
"""
|
25 |
+
Subclasses must override this method, but adhere to the same return type.
|
26 |
+
|
27 |
+
Returns:
|
28 |
+
dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
|
29 |
+
"""
|
30 |
+
pass
|
31 |
+
|
32 |
+
@property
|
33 |
+
def size_divisibility(self) -> int:
|
34 |
+
"""
|
35 |
+
Some backbones require the input height and width to be divisible by a
|
36 |
+
specific integer. This is typically true for encoder / decoder type networks
|
37 |
+
with lateral connection (e.g., FPN) for which feature maps need to match
|
38 |
+
dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
|
39 |
+
input size divisibility is required.
|
40 |
+
"""
|
41 |
+
return 0
|
42 |
+
|
43 |
+
@property
|
44 |
+
def padding_constraints(self) -> Dict[str, int]:
|
45 |
+
"""
|
46 |
+
This property is a generalization of size_divisibility. Some backbones and training
|
47 |
+
recipes require specific padding constraints, such as enforcing divisibility by a specific
|
48 |
+
integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale jitter
|
49 |
+
in :paper:vitdet). `padding_constraints` contains these optional items like:
|
50 |
+
{
|
51 |
+
"size_divisibility": int,
|
52 |
+
"square_size": int,
|
53 |
+
# Future options are possible
|
54 |
+
}
|
55 |
+
`size_divisibility` will read from here if presented and `square_size` indicates the
|
56 |
+
square padding size if `square_size` > 0.
|
57 |
+
|
58 |
+
TODO: use type of Dict[str, int] to avoid torchscipt issues. The type of padding_constraints
|
59 |
+
could be generalized as TypedDict (Python 3.8+) to support more types in the future.
|
60 |
+
"""
|
61 |
+
return {}
|
62 |
+
|
63 |
+
def output_shape(self):
|
64 |
+
"""
|
65 |
+
Returns:
|
66 |
+
dict[str->ShapeSpec]
|
67 |
+
"""
|
68 |
+
# this is a backward-compatible default
|
69 |
+
return {
|
70 |
+
name: ShapeSpec(
|
71 |
+
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
|
72 |
+
)
|
73 |
+
for name in self._out_features
|
74 |
+
}
|
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/build.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
from annotator.oneformer.detectron2.layers import ShapeSpec
|
3 |
+
from annotator.oneformer.detectron2.utils.registry import Registry
|
4 |
+
|
5 |
+
from .backbone import Backbone
|
6 |
+
|
7 |
+
BACKBONE_REGISTRY = Registry("BACKBONE")
|
8 |
+
BACKBONE_REGISTRY.__doc__ = """
|
9 |
+
Registry for backbones, which extract feature maps from images
|
10 |
+
|
11 |
+
The registered object must be a callable that accepts two arguments:
|
12 |
+
|
13 |
+
1. A :class:`detectron2.config.CfgNode`
|
14 |
+
2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification.
|
15 |
+
|
16 |
+
Registered object must return instance of :class:`Backbone`.
|
17 |
+
"""
|
18 |
+
|
19 |
+
|
20 |
+
def build_backbone(cfg, input_shape=None):
|
21 |
+
"""
|
22 |
+
Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
|
23 |
+
|
24 |
+
Returns:
|
25 |
+
an instance of :class:`Backbone`
|
26 |
+
"""
|
27 |
+
if input_shape is None:
|
28 |
+
input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
|
29 |
+
|
30 |
+
backbone_name = cfg.MODEL.BACKBONE.NAME
|
31 |
+
backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape)
|
32 |
+
assert isinstance(backbone, Backbone)
|
33 |
+
return backbone
|
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/fpn.py
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
import math
|
3 |
+
import fvcore.nn.weight_init as weight_init
|
4 |
+
import torch
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from torch import nn
|
7 |
+
|
8 |
+
from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
|
9 |
+
|
10 |
+
from .backbone import Backbone
|
11 |
+
from .build import BACKBONE_REGISTRY
|
12 |
+
from .resnet import build_resnet_backbone
|
13 |
+
|
14 |
+
__all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"]
|
15 |
+
|
16 |
+
|
17 |
+
class FPN(Backbone):
|
18 |
+
"""
|
19 |
+
This module implements :paper:`FPN`.
|
20 |
+
It creates pyramid features built on top of some input feature maps.
|
21 |
+
"""
|
22 |
+
|
23 |
+
_fuse_type: torch.jit.Final[str]
|
24 |
+
|
25 |
+
def __init__(
|
26 |
+
self,
|
27 |
+
bottom_up,
|
28 |
+
in_features,
|
29 |
+
out_channels,
|
30 |
+
norm="",
|
31 |
+
top_block=None,
|
32 |
+
fuse_type="sum",
|
33 |
+
square_pad=0,
|
34 |
+
):
|
35 |
+
"""
|
36 |
+
Args:
|
37 |
+
bottom_up (Backbone): module representing the bottom up subnetwork.
|
38 |
+
Must be a subclass of :class:`Backbone`. The multi-scale feature
|
39 |
+
maps generated by the bottom up network, and listed in `in_features`,
|
40 |
+
are used to generate FPN levels.
|
41 |
+
in_features (list[str]): names of the input feature maps coming
|
42 |
+
from the backbone to which FPN is attached. For example, if the
|
43 |
+
backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
|
44 |
+
of these may be used; order must be from high to low resolution.
|
45 |
+
out_channels (int): number of channels in the output feature maps.
|
46 |
+
norm (str): the normalization to use.
|
47 |
+
top_block (nn.Module or None): if provided, an extra operation will
|
48 |
+
be performed on the output of the last (smallest resolution)
|
49 |
+
FPN output, and the result will extend the result list. The top_block
|
50 |
+
further downsamples the feature map. It must have an attribute
|
51 |
+
"num_levels", meaning the number of extra FPN levels added by
|
52 |
+
this block, and "in_feature", which is a string representing
|
53 |
+
its input feature (e.g., p5).
|
54 |
+
fuse_type (str): types for fusing the top down features and the lateral
|
55 |
+
ones. It can be "sum" (default), which sums up element-wise; or "avg",
|
56 |
+
which takes the element-wise mean of the two.
|
57 |
+
square_pad (int): If > 0, require input images to be padded to specific square size.
|
58 |
+
"""
|
59 |
+
super(FPN, self).__init__()
|
60 |
+
assert isinstance(bottom_up, Backbone)
|
61 |
+
assert in_features, in_features
|
62 |
+
|
63 |
+
# Feature map strides and channels from the bottom up network (e.g. ResNet)
|
64 |
+
input_shapes = bottom_up.output_shape()
|
65 |
+
strides = [input_shapes[f].stride for f in in_features]
|
66 |
+
in_channels_per_feature = [input_shapes[f].channels for f in in_features]
|
67 |
+
|
68 |
+
_assert_strides_are_log2_contiguous(strides)
|
69 |
+
lateral_convs = []
|
70 |
+
output_convs = []
|
71 |
+
|
72 |
+
use_bias = norm == ""
|
73 |
+
for idx, in_channels in enumerate(in_channels_per_feature):
|
74 |
+
lateral_norm = get_norm(norm, out_channels)
|
75 |
+
output_norm = get_norm(norm, out_channels)
|
76 |
+
|
77 |
+
lateral_conv = Conv2d(
|
78 |
+
in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
|
79 |
+
)
|
80 |
+
output_conv = Conv2d(
|
81 |
+
out_channels,
|
82 |
+
out_channels,
|
83 |
+
kernel_size=3,
|
84 |
+
stride=1,
|
85 |
+
padding=1,
|
86 |
+
bias=use_bias,
|
87 |
+
norm=output_norm,
|
88 |
+
)
|
89 |
+
weight_init.c2_xavier_fill(lateral_conv)
|
90 |
+
weight_init.c2_xavier_fill(output_conv)
|
91 |
+
stage = int(math.log2(strides[idx]))
|
92 |
+
self.add_module("fpn_lateral{}".format(stage), lateral_conv)
|
93 |
+
self.add_module("fpn_output{}".format(stage), output_conv)
|
94 |
+
|
95 |
+
lateral_convs.append(lateral_conv)
|
96 |
+
output_convs.append(output_conv)
|
97 |
+
# Place convs into top-down order (from low to high resolution)
|
98 |
+
# to make the top-down computation in forward clearer.
|
99 |
+
self.lateral_convs = lateral_convs[::-1]
|
100 |
+
self.output_convs = output_convs[::-1]
|
101 |
+
self.top_block = top_block
|
102 |
+
self.in_features = tuple(in_features)
|
103 |
+
self.bottom_up = bottom_up
|
104 |
+
# Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
|
105 |
+
self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
|
106 |
+
# top block output feature maps.
|
107 |
+
if self.top_block is not None:
|
108 |
+
for s in range(stage, stage + self.top_block.num_levels):
|
109 |
+
self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
|
110 |
+
|
111 |
+
self._out_features = list(self._out_feature_strides.keys())
|
112 |
+
self._out_feature_channels = {k: out_channels for k in self._out_features}
|
113 |
+
self._size_divisibility = strides[-1]
|
114 |
+
self._square_pad = square_pad
|
115 |
+
assert fuse_type in {"avg", "sum"}
|
116 |
+
self._fuse_type = fuse_type
|
117 |
+
|
118 |
+
@property
|
119 |
+
def size_divisibility(self):
|
120 |
+
return self._size_divisibility
|
121 |
+
|
122 |
+
@property
|
123 |
+
def padding_constraints(self):
|
124 |
+
return {"square_size": self._square_pad}
|
125 |
+
|
126 |
+
def forward(self, x):
|
127 |
+
"""
|
128 |
+
Args:
|
129 |
+
input (dict[str->Tensor]): mapping feature map name (e.g., "res5") to
|
130 |
+
feature map tensor for each feature level in high to low resolution order.
|
131 |
+
|
132 |
+
Returns:
|
133 |
+
dict[str->Tensor]:
|
134 |
+
mapping from feature map name to FPN feature map tensor
|
135 |
+
in high to low resolution order. Returned feature names follow the FPN
|
136 |
+
paper convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
|
137 |
+
["p2", "p3", ..., "p6"].
|
138 |
+
"""
|
139 |
+
bottom_up_features = self.bottom_up(x)
|
140 |
+
results = []
|
141 |
+
prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]])
|
142 |
+
results.append(self.output_convs[0](prev_features))
|
143 |
+
|
144 |
+
# Reverse feature maps into top-down order (from low to high resolution)
|
145 |
+
for idx, (lateral_conv, output_conv) in enumerate(
|
146 |
+
zip(self.lateral_convs, self.output_convs)
|
147 |
+
):
|
148 |
+
# Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336
|
149 |
+
# Therefore we loop over all modules but skip the first one
|
150 |
+
if idx > 0:
|
151 |
+
features = self.in_features[-idx - 1]
|
152 |
+
features = bottom_up_features[features]
|
153 |
+
top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest")
|
154 |
+
lateral_features = lateral_conv(features)
|
155 |
+
prev_features = lateral_features + top_down_features
|
156 |
+
if self._fuse_type == "avg":
|
157 |
+
prev_features /= 2
|
158 |
+
results.insert(0, output_conv(prev_features))
|
159 |
+
|
160 |
+
if self.top_block is not None:
|
161 |
+
if self.top_block.in_feature in bottom_up_features:
|
162 |
+
top_block_in_feature = bottom_up_features[self.top_block.in_feature]
|
163 |
+
else:
|
164 |
+
top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
|
165 |
+
results.extend(self.top_block(top_block_in_feature))
|
166 |
+
assert len(self._out_features) == len(results)
|
167 |
+
return {f: res for f, res in zip(self._out_features, results)}
|
168 |
+
|
169 |
+
def output_shape(self):
|
170 |
+
return {
|
171 |
+
name: ShapeSpec(
|
172 |
+
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
|
173 |
+
)
|
174 |
+
for name in self._out_features
|
175 |
+
}
|
176 |
+
|
177 |
+
|
178 |
+
def _assert_strides_are_log2_contiguous(strides):
|
179 |
+
"""
|
180 |
+
Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2".
|
181 |
+
"""
|
182 |
+
for i, stride in enumerate(strides[1:], 1):
|
183 |
+
assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(
|
184 |
+
stride, strides[i - 1]
|
185 |
+
)
|
186 |
+
|
187 |
+
|
188 |
+
class LastLevelMaxPool(nn.Module):
|
189 |
+
"""
|
190 |
+
This module is used in the original FPN to generate a downsampled
|
191 |
+
P6 feature from P5.
|
192 |
+
"""
|
193 |
+
|
194 |
+
def __init__(self):
|
195 |
+
super().__init__()
|
196 |
+
self.num_levels = 1
|
197 |
+
self.in_feature = "p5"
|
198 |
+
|
199 |
+
def forward(self, x):
|
200 |
+
return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
|
201 |
+
|
202 |
+
|
203 |
+
class LastLevelP6P7(nn.Module):
|
204 |
+
"""
|
205 |
+
This module is used in RetinaNet to generate extra layers, P6 and P7 from
|
206 |
+
C5 feature.
|
207 |
+
"""
|
208 |
+
|
209 |
+
def __init__(self, in_channels, out_channels, in_feature="res5"):
|
210 |
+
super().__init__()
|
211 |
+
self.num_levels = 2
|
212 |
+
self.in_feature = in_feature
|
213 |
+
self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
|
214 |
+
self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
|
215 |
+
for module in [self.p6, self.p7]:
|
216 |
+
weight_init.c2_xavier_fill(module)
|
217 |
+
|
218 |
+
def forward(self, c5):
|
219 |
+
p6 = self.p6(c5)
|
220 |
+
p7 = self.p7(F.relu(p6))
|
221 |
+
return [p6, p7]
|
222 |
+
|
223 |
+
|
224 |
+
@BACKBONE_REGISTRY.register()
|
225 |
+
def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
|
226 |
+
"""
|
227 |
+
Args:
|
228 |
+
cfg: a detectron2 CfgNode
|
229 |
+
|
230 |
+
Returns:
|
231 |
+
backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
|
232 |
+
"""
|
233 |
+
bottom_up = build_resnet_backbone(cfg, input_shape)
|
234 |
+
in_features = cfg.MODEL.FPN.IN_FEATURES
|
235 |
+
out_channels = cfg.MODEL.FPN.OUT_CHANNELS
|
236 |
+
backbone = FPN(
|
237 |
+
bottom_up=bottom_up,
|
238 |
+
in_features=in_features,
|
239 |
+
out_channels=out_channels,
|
240 |
+
norm=cfg.MODEL.FPN.NORM,
|
241 |
+
top_block=LastLevelMaxPool(),
|
242 |
+
fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
|
243 |
+
)
|
244 |
+
return backbone
|
245 |
+
|
246 |
+
|
247 |
+
@BACKBONE_REGISTRY.register()
|
248 |
+
def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
|
249 |
+
"""
|
250 |
+
Args:
|
251 |
+
cfg: a detectron2 CfgNode
|
252 |
+
|
253 |
+
Returns:
|
254 |
+
backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
|
255 |
+
"""
|
256 |
+
bottom_up = build_resnet_backbone(cfg, input_shape)
|
257 |
+
in_features = cfg.MODEL.FPN.IN_FEATURES
|
258 |
+
out_channels = cfg.MODEL.FPN.OUT_CHANNELS
|
259 |
+
in_channels_p6p7 = bottom_up.output_shape()["res5"].channels
|
260 |
+
backbone = FPN(
|
261 |
+
bottom_up=bottom_up,
|
262 |
+
in_features=in_features,
|
263 |
+
out_channels=out_channels,
|
264 |
+
norm=cfg.MODEL.FPN.NORM,
|
265 |
+
top_block=LastLevelP6P7(in_channels_p6p7, out_channels),
|
266 |
+
fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
|
267 |
+
)
|
268 |
+
return backbone
|
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/mvit.py
ADDED
@@ -0,0 +1,448 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
|
6 |
+
from .backbone import Backbone
|
7 |
+
from .utils import (
|
8 |
+
PatchEmbed,
|
9 |
+
add_decomposed_rel_pos,
|
10 |
+
get_abs_pos,
|
11 |
+
window_partition,
|
12 |
+
window_unpartition,
|
13 |
+
)
|
14 |
+
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
|
18 |
+
__all__ = ["MViT"]
|
19 |
+
|
20 |
+
|
21 |
+
def attention_pool(x, pool, norm=None):
|
22 |
+
# (B, H, W, C) -> (B, C, H, W)
|
23 |
+
x = x.permute(0, 3, 1, 2)
|
24 |
+
x = pool(x)
|
25 |
+
# (B, C, H1, W1) -> (B, H1, W1, C)
|
26 |
+
x = x.permute(0, 2, 3, 1)
|
27 |
+
if norm:
|
28 |
+
x = norm(x)
|
29 |
+
|
30 |
+
return x
|
31 |
+
|
32 |
+
|
33 |
+
class MultiScaleAttention(nn.Module):
|
34 |
+
"""Multiscale Multi-head Attention block."""
|
35 |
+
|
36 |
+
def __init__(
|
37 |
+
self,
|
38 |
+
dim,
|
39 |
+
dim_out,
|
40 |
+
num_heads,
|
41 |
+
qkv_bias=True,
|
42 |
+
norm_layer=nn.LayerNorm,
|
43 |
+
pool_kernel=(3, 3),
|
44 |
+
stride_q=1,
|
45 |
+
stride_kv=1,
|
46 |
+
residual_pooling=True,
|
47 |
+
window_size=0,
|
48 |
+
use_rel_pos=False,
|
49 |
+
rel_pos_zero_init=True,
|
50 |
+
input_size=None,
|
51 |
+
):
|
52 |
+
"""
|
53 |
+
Args:
|
54 |
+
dim (int): Number of input channels.
|
55 |
+
dim_out (int): Number of output channels.
|
56 |
+
num_heads (int): Number of attention heads.
|
57 |
+
qkv_bias (bool: If True, add a learnable bias to query, key, value.
|
58 |
+
norm_layer (nn.Module): Normalization layer.
|
59 |
+
pool_kernel (tuple): kernel size for qkv pooling layers.
|
60 |
+
stride_q (int): stride size for q pooling layer.
|
61 |
+
stride_kv (int): stride size for kv pooling layer.
|
62 |
+
residual_pooling (bool): If true, enable residual pooling.
|
63 |
+
use_rel_pos (bool): If True, add relative postional embeddings to the attention map.
|
64 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
65 |
+
input_size (int or None): Input resolution.
|
66 |
+
"""
|
67 |
+
super().__init__()
|
68 |
+
self.num_heads = num_heads
|
69 |
+
head_dim = dim_out // num_heads
|
70 |
+
self.scale = head_dim**-0.5
|
71 |
+
|
72 |
+
self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias)
|
73 |
+
self.proj = nn.Linear(dim_out, dim_out)
|
74 |
+
|
75 |
+
# qkv pooling
|
76 |
+
pool_padding = [k // 2 for k in pool_kernel]
|
77 |
+
dim_conv = dim_out // num_heads
|
78 |
+
self.pool_q = nn.Conv2d(
|
79 |
+
dim_conv,
|
80 |
+
dim_conv,
|
81 |
+
pool_kernel,
|
82 |
+
stride=stride_q,
|
83 |
+
padding=pool_padding,
|
84 |
+
groups=dim_conv,
|
85 |
+
bias=False,
|
86 |
+
)
|
87 |
+
self.norm_q = norm_layer(dim_conv)
|
88 |
+
self.pool_k = nn.Conv2d(
|
89 |
+
dim_conv,
|
90 |
+
dim_conv,
|
91 |
+
pool_kernel,
|
92 |
+
stride=stride_kv,
|
93 |
+
padding=pool_padding,
|
94 |
+
groups=dim_conv,
|
95 |
+
bias=False,
|
96 |
+
)
|
97 |
+
self.norm_k = norm_layer(dim_conv)
|
98 |
+
self.pool_v = nn.Conv2d(
|
99 |
+
dim_conv,
|
100 |
+
dim_conv,
|
101 |
+
pool_kernel,
|
102 |
+
stride=stride_kv,
|
103 |
+
padding=pool_padding,
|
104 |
+
groups=dim_conv,
|
105 |
+
bias=False,
|
106 |
+
)
|
107 |
+
self.norm_v = norm_layer(dim_conv)
|
108 |
+
|
109 |
+
self.window_size = window_size
|
110 |
+
if window_size:
|
111 |
+
self.q_win_size = window_size // stride_q
|
112 |
+
self.kv_win_size = window_size // stride_kv
|
113 |
+
self.residual_pooling = residual_pooling
|
114 |
+
|
115 |
+
self.use_rel_pos = use_rel_pos
|
116 |
+
if self.use_rel_pos:
|
117 |
+
# initialize relative positional embeddings
|
118 |
+
assert input_size[0] == input_size[1]
|
119 |
+
size = input_size[0]
|
120 |
+
rel_dim = 2 * max(size // stride_q, size // stride_kv) - 1
|
121 |
+
self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim))
|
122 |
+
self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim))
|
123 |
+
|
124 |
+
if not rel_pos_zero_init:
|
125 |
+
nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
|
126 |
+
nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
|
127 |
+
|
128 |
+
def forward(self, x):
|
129 |
+
B, H, W, _ = x.shape
|
130 |
+
# qkv with shape (3, B, nHead, H, W, C)
|
131 |
+
qkv = self.qkv(x).reshape(B, H, W, 3, self.num_heads, -1).permute(3, 0, 4, 1, 2, 5)
|
132 |
+
# q, k, v with shape (B * nHead, H, W, C)
|
133 |
+
q, k, v = qkv.reshape(3, B * self.num_heads, H, W, -1).unbind(0)
|
134 |
+
|
135 |
+
q = attention_pool(q, self.pool_q, self.norm_q)
|
136 |
+
k = attention_pool(k, self.pool_k, self.norm_k)
|
137 |
+
v = attention_pool(v, self.pool_v, self.norm_v)
|
138 |
+
|
139 |
+
ori_q = q
|
140 |
+
if self.window_size:
|
141 |
+
q, q_hw_pad = window_partition(q, self.q_win_size)
|
142 |
+
k, kv_hw_pad = window_partition(k, self.kv_win_size)
|
143 |
+
v, _ = window_partition(v, self.kv_win_size)
|
144 |
+
q_hw = (self.q_win_size, self.q_win_size)
|
145 |
+
kv_hw = (self.kv_win_size, self.kv_win_size)
|
146 |
+
else:
|
147 |
+
q_hw = q.shape[1:3]
|
148 |
+
kv_hw = k.shape[1:3]
|
149 |
+
|
150 |
+
q = q.view(q.shape[0], np.prod(q_hw), -1)
|
151 |
+
k = k.view(k.shape[0], np.prod(kv_hw), -1)
|
152 |
+
v = v.view(v.shape[0], np.prod(kv_hw), -1)
|
153 |
+
|
154 |
+
attn = (q * self.scale) @ k.transpose(-2, -1)
|
155 |
+
|
156 |
+
if self.use_rel_pos:
|
157 |
+
attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, q_hw, kv_hw)
|
158 |
+
|
159 |
+
attn = attn.softmax(dim=-1)
|
160 |
+
x = attn @ v
|
161 |
+
|
162 |
+
x = x.view(x.shape[0], q_hw[0], q_hw[1], -1)
|
163 |
+
|
164 |
+
if self.window_size:
|
165 |
+
x = window_unpartition(x, self.q_win_size, q_hw_pad, ori_q.shape[1:3])
|
166 |
+
|
167 |
+
if self.residual_pooling:
|
168 |
+
x += ori_q
|
169 |
+
|
170 |
+
H, W = x.shape[1], x.shape[2]
|
171 |
+
x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
|
172 |
+
x = self.proj(x)
|
173 |
+
|
174 |
+
return x
|
175 |
+
|
176 |
+
|
177 |
+
class MultiScaleBlock(nn.Module):
|
178 |
+
"""Multiscale Transformer blocks"""
|
179 |
+
|
180 |
+
def __init__(
|
181 |
+
self,
|
182 |
+
dim,
|
183 |
+
dim_out,
|
184 |
+
num_heads,
|
185 |
+
mlp_ratio=4.0,
|
186 |
+
qkv_bias=True,
|
187 |
+
drop_path=0.0,
|
188 |
+
norm_layer=nn.LayerNorm,
|
189 |
+
act_layer=nn.GELU,
|
190 |
+
qkv_pool_kernel=(3, 3),
|
191 |
+
stride_q=1,
|
192 |
+
stride_kv=1,
|
193 |
+
residual_pooling=True,
|
194 |
+
window_size=0,
|
195 |
+
use_rel_pos=False,
|
196 |
+
rel_pos_zero_init=True,
|
197 |
+
input_size=None,
|
198 |
+
):
|
199 |
+
"""
|
200 |
+
Args:
|
201 |
+
dim (int): Number of input channels.
|
202 |
+
dim_out (int): Number of output channels.
|
203 |
+
num_heads (int): Number of attention heads in the MViT block.
|
204 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
205 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value.
|
206 |
+
drop_path (float): Stochastic depth rate.
|
207 |
+
norm_layer (nn.Module): Normalization layer.
|
208 |
+
act_layer (nn.Module): Activation layer.
|
209 |
+
qkv_pool_kernel (tuple): kernel size for qkv pooling layers.
|
210 |
+
stride_q (int): stride size for q pooling layer.
|
211 |
+
stride_kv (int): stride size for kv pooling layer.
|
212 |
+
residual_pooling (bool): If true, enable residual pooling.
|
213 |
+
window_size (int): Window size for window attention blocks. If it equals 0, then not
|
214 |
+
use window attention.
|
215 |
+
use_rel_pos (bool): If True, add relative postional embeddings to the attention map.
|
216 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
217 |
+
input_size (int or None): Input resolution.
|
218 |
+
"""
|
219 |
+
super().__init__()
|
220 |
+
self.norm1 = norm_layer(dim)
|
221 |
+
self.attn = MultiScaleAttention(
|
222 |
+
dim,
|
223 |
+
dim_out,
|
224 |
+
num_heads=num_heads,
|
225 |
+
qkv_bias=qkv_bias,
|
226 |
+
norm_layer=norm_layer,
|
227 |
+
pool_kernel=qkv_pool_kernel,
|
228 |
+
stride_q=stride_q,
|
229 |
+
stride_kv=stride_kv,
|
230 |
+
residual_pooling=residual_pooling,
|
231 |
+
window_size=window_size,
|
232 |
+
use_rel_pos=use_rel_pos,
|
233 |
+
rel_pos_zero_init=rel_pos_zero_init,
|
234 |
+
input_size=input_size,
|
235 |
+
)
|
236 |
+
|
237 |
+
from timm.models.layers import DropPath, Mlp
|
238 |
+
|
239 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
240 |
+
self.norm2 = norm_layer(dim_out)
|
241 |
+
self.mlp = Mlp(
|
242 |
+
in_features=dim_out,
|
243 |
+
hidden_features=int(dim_out * mlp_ratio),
|
244 |
+
out_features=dim_out,
|
245 |
+
act_layer=act_layer,
|
246 |
+
)
|
247 |
+
|
248 |
+
if dim != dim_out:
|
249 |
+
self.proj = nn.Linear(dim, dim_out)
|
250 |
+
|
251 |
+
if stride_q > 1:
|
252 |
+
kernel_skip = stride_q + 1
|
253 |
+
padding_skip = int(kernel_skip // 2)
|
254 |
+
self.pool_skip = nn.MaxPool2d(kernel_skip, stride_q, padding_skip, ceil_mode=False)
|
255 |
+
|
256 |
+
def forward(self, x):
|
257 |
+
x_norm = self.norm1(x)
|
258 |
+
x_block = self.attn(x_norm)
|
259 |
+
|
260 |
+
if hasattr(self, "proj"):
|
261 |
+
x = self.proj(x_norm)
|
262 |
+
if hasattr(self, "pool_skip"):
|
263 |
+
x = attention_pool(x, self.pool_skip)
|
264 |
+
|
265 |
+
x = x + self.drop_path(x_block)
|
266 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
267 |
+
|
268 |
+
return x
|
269 |
+
|
270 |
+
|
271 |
+
class MViT(Backbone):
|
272 |
+
"""
|
273 |
+
This module implements Multiscale Vision Transformer (MViT) backbone in :paper:'mvitv2'.
|
274 |
+
"""
|
275 |
+
|
276 |
+
def __init__(
|
277 |
+
self,
|
278 |
+
img_size=224,
|
279 |
+
patch_kernel=(7, 7),
|
280 |
+
patch_stride=(4, 4),
|
281 |
+
patch_padding=(3, 3),
|
282 |
+
in_chans=3,
|
283 |
+
embed_dim=96,
|
284 |
+
depth=16,
|
285 |
+
num_heads=1,
|
286 |
+
last_block_indexes=(0, 2, 11, 15),
|
287 |
+
qkv_pool_kernel=(3, 3),
|
288 |
+
adaptive_kv_stride=4,
|
289 |
+
adaptive_window_size=56,
|
290 |
+
residual_pooling=True,
|
291 |
+
mlp_ratio=4.0,
|
292 |
+
qkv_bias=True,
|
293 |
+
drop_path_rate=0.0,
|
294 |
+
norm_layer=nn.LayerNorm,
|
295 |
+
act_layer=nn.GELU,
|
296 |
+
use_abs_pos=False,
|
297 |
+
use_rel_pos=True,
|
298 |
+
rel_pos_zero_init=True,
|
299 |
+
use_act_checkpoint=False,
|
300 |
+
pretrain_img_size=224,
|
301 |
+
pretrain_use_cls_token=True,
|
302 |
+
out_features=("scale2", "scale3", "scale4", "scale5"),
|
303 |
+
):
|
304 |
+
"""
|
305 |
+
Args:
|
306 |
+
img_size (int): Input image size.
|
307 |
+
patch_kernel (tuple): kernel size for patch embedding.
|
308 |
+
patch_stride (tuple): stride size for patch embedding.
|
309 |
+
patch_padding (tuple): padding size for patch embedding.
|
310 |
+
in_chans (int): Number of input image channels.
|
311 |
+
embed_dim (int): Patch embedding dimension.
|
312 |
+
depth (int): Depth of MViT.
|
313 |
+
num_heads (int): Number of base attention heads in each MViT block.
|
314 |
+
last_block_indexes (tuple): Block indexes for last blocks in each stage.
|
315 |
+
qkv_pool_kernel (tuple): kernel size for qkv pooling layers.
|
316 |
+
adaptive_kv_stride (int): adaptive stride size for kv pooling.
|
317 |
+
adaptive_window_size (int): adaptive window size for window attention blocks.
|
318 |
+
residual_pooling (bool): If true, enable residual pooling.
|
319 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
320 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value.
|
321 |
+
drop_path_rate (float): Stochastic depth rate.
|
322 |
+
norm_layer (nn.Module): Normalization layer.
|
323 |
+
act_layer (nn.Module): Activation layer.
|
324 |
+
use_abs_pos (bool): If True, use absolute positional embeddings.
|
325 |
+
use_rel_pos (bool): If True, add relative postional embeddings to the attention map.
|
326 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
327 |
+
window_size (int): Window size for window attention blocks.
|
328 |
+
use_act_checkpoint (bool): If True, use activation checkpointing.
|
329 |
+
pretrain_img_size (int): input image size for pretraining models.
|
330 |
+
pretrain_use_cls_token (bool): If True, pretrainig models use class token.
|
331 |
+
out_features (tuple): name of the feature maps from each stage.
|
332 |
+
"""
|
333 |
+
super().__init__()
|
334 |
+
self.pretrain_use_cls_token = pretrain_use_cls_token
|
335 |
+
|
336 |
+
self.patch_embed = PatchEmbed(
|
337 |
+
kernel_size=patch_kernel,
|
338 |
+
stride=patch_stride,
|
339 |
+
padding=patch_padding,
|
340 |
+
in_chans=in_chans,
|
341 |
+
embed_dim=embed_dim,
|
342 |
+
)
|
343 |
+
|
344 |
+
if use_abs_pos:
|
345 |
+
# Initialize absoluate positional embedding with pretrain image size.
|
346 |
+
num_patches = (pretrain_img_size // patch_stride[0]) * (
|
347 |
+
pretrain_img_size // patch_stride[1]
|
348 |
+
)
|
349 |
+
num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
|
350 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
|
351 |
+
else:
|
352 |
+
self.pos_embed = None
|
353 |
+
|
354 |
+
# stochastic depth decay rule
|
355 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
|
356 |
+
dim_out = embed_dim
|
357 |
+
stride_kv = adaptive_kv_stride
|
358 |
+
window_size = adaptive_window_size
|
359 |
+
input_size = (img_size // patch_stride[0], img_size // patch_stride[1])
|
360 |
+
stage = 2
|
361 |
+
stride = patch_stride[0]
|
362 |
+
self._out_feature_strides = {}
|
363 |
+
self._out_feature_channels = {}
|
364 |
+
self.blocks = nn.ModuleList()
|
365 |
+
for i in range(depth):
|
366 |
+
# Multiply stride_kv by 2 if it's the last block of stage2 and stage3.
|
367 |
+
if i == last_block_indexes[1] or i == last_block_indexes[2]:
|
368 |
+
stride_kv_ = stride_kv * 2
|
369 |
+
else:
|
370 |
+
stride_kv_ = stride_kv
|
371 |
+
# hybrid window attention: global attention in last three stages.
|
372 |
+
window_size_ = 0 if i in last_block_indexes[1:] else window_size
|
373 |
+
block = MultiScaleBlock(
|
374 |
+
dim=embed_dim,
|
375 |
+
dim_out=dim_out,
|
376 |
+
num_heads=num_heads,
|
377 |
+
mlp_ratio=mlp_ratio,
|
378 |
+
qkv_bias=qkv_bias,
|
379 |
+
drop_path=dpr[i],
|
380 |
+
norm_layer=norm_layer,
|
381 |
+
qkv_pool_kernel=qkv_pool_kernel,
|
382 |
+
stride_q=2 if i - 1 in last_block_indexes else 1,
|
383 |
+
stride_kv=stride_kv_,
|
384 |
+
residual_pooling=residual_pooling,
|
385 |
+
window_size=window_size_,
|
386 |
+
use_rel_pos=use_rel_pos,
|
387 |
+
rel_pos_zero_init=rel_pos_zero_init,
|
388 |
+
input_size=input_size,
|
389 |
+
)
|
390 |
+
if use_act_checkpoint:
|
391 |
+
# TODO: use torch.utils.checkpoint
|
392 |
+
from fairscale.nn.checkpoint import checkpoint_wrapper
|
393 |
+
|
394 |
+
block = checkpoint_wrapper(block)
|
395 |
+
self.blocks.append(block)
|
396 |
+
|
397 |
+
embed_dim = dim_out
|
398 |
+
if i in last_block_indexes:
|
399 |
+
name = f"scale{stage}"
|
400 |
+
if name in out_features:
|
401 |
+
self._out_feature_channels[name] = dim_out
|
402 |
+
self._out_feature_strides[name] = stride
|
403 |
+
self.add_module(f"{name}_norm", norm_layer(dim_out))
|
404 |
+
|
405 |
+
dim_out *= 2
|
406 |
+
num_heads *= 2
|
407 |
+
stride_kv = max(stride_kv // 2, 1)
|
408 |
+
stride *= 2
|
409 |
+
stage += 1
|
410 |
+
if i - 1 in last_block_indexes:
|
411 |
+
window_size = window_size // 2
|
412 |
+
input_size = [s // 2 for s in input_size]
|
413 |
+
|
414 |
+
self._out_features = out_features
|
415 |
+
self._last_block_indexes = last_block_indexes
|
416 |
+
|
417 |
+
if self.pos_embed is not None:
|
418 |
+
nn.init.trunc_normal_(self.pos_embed, std=0.02)
|
419 |
+
|
420 |
+
self.apply(self._init_weights)
|
421 |
+
|
422 |
+
def _init_weights(self, m):
|
423 |
+
if isinstance(m, nn.Linear):
|
424 |
+
nn.init.trunc_normal_(m.weight, std=0.02)
|
425 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
426 |
+
nn.init.constant_(m.bias, 0)
|
427 |
+
elif isinstance(m, nn.LayerNorm):
|
428 |
+
nn.init.constant_(m.bias, 0)
|
429 |
+
nn.init.constant_(m.weight, 1.0)
|
430 |
+
|
431 |
+
def forward(self, x):
|
432 |
+
x = self.patch_embed(x)
|
433 |
+
|
434 |
+
if self.pos_embed is not None:
|
435 |
+
x = x + get_abs_pos(self.pos_embed, self.pretrain_use_cls_token, x.shape[1:3])
|
436 |
+
|
437 |
+
outputs = {}
|
438 |
+
stage = 2
|
439 |
+
for i, blk in enumerate(self.blocks):
|
440 |
+
x = blk(x)
|
441 |
+
if i in self._last_block_indexes:
|
442 |
+
name = f"scale{stage}"
|
443 |
+
if name in self._out_features:
|
444 |
+
x_out = getattr(self, f"{name}_norm")(x)
|
445 |
+
outputs[name] = x_out.permute(0, 3, 1, 2)
|
446 |
+
stage += 1
|
447 |
+
|
448 |
+
return outputs
|
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/regnet.py
ADDED
@@ -0,0 +1,452 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
2 |
+
"""
|
3 |
+
Implementation of RegNet models from :paper:`dds` and :paper:`scaling`.
|
4 |
+
|
5 |
+
This code is adapted from https://github.com/facebookresearch/pycls with minimal modifications.
|
6 |
+
Some code duplication exists between RegNet and ResNets (e.g., ResStem) in order to simplify
|
7 |
+
model loading.
|
8 |
+
"""
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
from torch import nn
|
12 |
+
|
13 |
+
from annotator.oneformer.detectron2.layers import CNNBlockBase, ShapeSpec, get_norm
|
14 |
+
|
15 |
+
from .backbone import Backbone
|
16 |
+
|
17 |
+
__all__ = [
|
18 |
+
"AnyNet",
|
19 |
+
"RegNet",
|
20 |
+
"ResStem",
|
21 |
+
"SimpleStem",
|
22 |
+
"VanillaBlock",
|
23 |
+
"ResBasicBlock",
|
24 |
+
"ResBottleneckBlock",
|
25 |
+
]
|
26 |
+
|
27 |
+
|
28 |
+
def conv2d(w_in, w_out, k, *, stride=1, groups=1, bias=False):
|
29 |
+
"""Helper for building a conv2d layer."""
|
30 |
+
assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
|
31 |
+
s, p, g, b = stride, (k - 1) // 2, groups, bias
|
32 |
+
return nn.Conv2d(w_in, w_out, k, stride=s, padding=p, groups=g, bias=b)
|
33 |
+
|
34 |
+
|
35 |
+
def gap2d():
|
36 |
+
"""Helper for building a global average pooling layer."""
|
37 |
+
return nn.AdaptiveAvgPool2d((1, 1))
|
38 |
+
|
39 |
+
|
40 |
+
def pool2d(k, *, stride=1):
|
41 |
+
"""Helper for building a pool2d layer."""
|
42 |
+
assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
|
43 |
+
return nn.MaxPool2d(k, stride=stride, padding=(k - 1) // 2)
|
44 |
+
|
45 |
+
|
46 |
+
def init_weights(m):
|
47 |
+
"""Performs ResNet-style weight initialization."""
|
48 |
+
if isinstance(m, nn.Conv2d):
|
49 |
+
# Note that there is no bias due to BN
|
50 |
+
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
51 |
+
m.weight.data.normal_(mean=0.0, std=np.sqrt(2.0 / fan_out))
|
52 |
+
elif isinstance(m, nn.BatchNorm2d):
|
53 |
+
m.weight.data.fill_(1.0)
|
54 |
+
m.bias.data.zero_()
|
55 |
+
elif isinstance(m, nn.Linear):
|
56 |
+
m.weight.data.normal_(mean=0.0, std=0.01)
|
57 |
+
m.bias.data.zero_()
|
58 |
+
|
59 |
+
|
60 |
+
class ResStem(CNNBlockBase):
|
61 |
+
"""ResNet stem for ImageNet: 7x7, BN, AF, MaxPool."""
|
62 |
+
|
63 |
+
def __init__(self, w_in, w_out, norm, activation_class):
|
64 |
+
super().__init__(w_in, w_out, 4)
|
65 |
+
self.conv = conv2d(w_in, w_out, 7, stride=2)
|
66 |
+
self.bn = get_norm(norm, w_out)
|
67 |
+
self.af = activation_class()
|
68 |
+
self.pool = pool2d(3, stride=2)
|
69 |
+
|
70 |
+
def forward(self, x):
|
71 |
+
for layer in self.children():
|
72 |
+
x = layer(x)
|
73 |
+
return x
|
74 |
+
|
75 |
+
|
76 |
+
class SimpleStem(CNNBlockBase):
|
77 |
+
"""Simple stem for ImageNet: 3x3, BN, AF."""
|
78 |
+
|
79 |
+
def __init__(self, w_in, w_out, norm, activation_class):
|
80 |
+
super().__init__(w_in, w_out, 2)
|
81 |
+
self.conv = conv2d(w_in, w_out, 3, stride=2)
|
82 |
+
self.bn = get_norm(norm, w_out)
|
83 |
+
self.af = activation_class()
|
84 |
+
|
85 |
+
def forward(self, x):
|
86 |
+
for layer in self.children():
|
87 |
+
x = layer(x)
|
88 |
+
return x
|
89 |
+
|
90 |
+
|
91 |
+
class SE(nn.Module):
|
92 |
+
"""Squeeze-and-Excitation (SE) block: AvgPool, FC, Act, FC, Sigmoid."""
|
93 |
+
|
94 |
+
def __init__(self, w_in, w_se, activation_class):
|
95 |
+
super().__init__()
|
96 |
+
self.avg_pool = gap2d()
|
97 |
+
self.f_ex = nn.Sequential(
|
98 |
+
conv2d(w_in, w_se, 1, bias=True),
|
99 |
+
activation_class(),
|
100 |
+
conv2d(w_se, w_in, 1, bias=True),
|
101 |
+
nn.Sigmoid(),
|
102 |
+
)
|
103 |
+
|
104 |
+
def forward(self, x):
|
105 |
+
return x * self.f_ex(self.avg_pool(x))
|
106 |
+
|
107 |
+
|
108 |
+
class VanillaBlock(CNNBlockBase):
|
109 |
+
"""Vanilla block: [3x3 conv, BN, Relu] x2."""
|
110 |
+
|
111 |
+
def __init__(self, w_in, w_out, stride, norm, activation_class, _params):
|
112 |
+
super().__init__(w_in, w_out, stride)
|
113 |
+
self.a = conv2d(w_in, w_out, 3, stride=stride)
|
114 |
+
self.a_bn = get_norm(norm, w_out)
|
115 |
+
self.a_af = activation_class()
|
116 |
+
self.b = conv2d(w_out, w_out, 3)
|
117 |
+
self.b_bn = get_norm(norm, w_out)
|
118 |
+
self.b_af = activation_class()
|
119 |
+
|
120 |
+
def forward(self, x):
|
121 |
+
for layer in self.children():
|
122 |
+
x = layer(x)
|
123 |
+
return x
|
124 |
+
|
125 |
+
|
126 |
+
class BasicTransform(nn.Module):
|
127 |
+
"""Basic transformation: [3x3 conv, BN, Relu] x2."""
|
128 |
+
|
129 |
+
def __init__(self, w_in, w_out, stride, norm, activation_class, _params):
|
130 |
+
super().__init__()
|
131 |
+
self.a = conv2d(w_in, w_out, 3, stride=stride)
|
132 |
+
self.a_bn = get_norm(norm, w_out)
|
133 |
+
self.a_af = activation_class()
|
134 |
+
self.b = conv2d(w_out, w_out, 3)
|
135 |
+
self.b_bn = get_norm(norm, w_out)
|
136 |
+
self.b_bn.final_bn = True
|
137 |
+
|
138 |
+
def forward(self, x):
|
139 |
+
for layer in self.children():
|
140 |
+
x = layer(x)
|
141 |
+
return x
|
142 |
+
|
143 |
+
|
144 |
+
class ResBasicBlock(CNNBlockBase):
|
145 |
+
"""Residual basic block: x + f(x), f = basic transform."""
|
146 |
+
|
147 |
+
def __init__(self, w_in, w_out, stride, norm, activation_class, params):
|
148 |
+
super().__init__(w_in, w_out, stride)
|
149 |
+
self.proj, self.bn = None, None
|
150 |
+
if (w_in != w_out) or (stride != 1):
|
151 |
+
self.proj = conv2d(w_in, w_out, 1, stride=stride)
|
152 |
+
self.bn = get_norm(norm, w_out)
|
153 |
+
self.f = BasicTransform(w_in, w_out, stride, norm, activation_class, params)
|
154 |
+
self.af = activation_class()
|
155 |
+
|
156 |
+
def forward(self, x):
|
157 |
+
x_p = self.bn(self.proj(x)) if self.proj else x
|
158 |
+
return self.af(x_p + self.f(x))
|
159 |
+
|
160 |
+
|
161 |
+
class BottleneckTransform(nn.Module):
|
162 |
+
"""Bottleneck transformation: 1x1, 3x3 [+SE], 1x1."""
|
163 |
+
|
164 |
+
def __init__(self, w_in, w_out, stride, norm, activation_class, params):
|
165 |
+
super().__init__()
|
166 |
+
w_b = int(round(w_out * params["bot_mul"]))
|
167 |
+
w_se = int(round(w_in * params["se_r"]))
|
168 |
+
groups = w_b // params["group_w"]
|
169 |
+
self.a = conv2d(w_in, w_b, 1)
|
170 |
+
self.a_bn = get_norm(norm, w_b)
|
171 |
+
self.a_af = activation_class()
|
172 |
+
self.b = conv2d(w_b, w_b, 3, stride=stride, groups=groups)
|
173 |
+
self.b_bn = get_norm(norm, w_b)
|
174 |
+
self.b_af = activation_class()
|
175 |
+
self.se = SE(w_b, w_se, activation_class) if w_se else None
|
176 |
+
self.c = conv2d(w_b, w_out, 1)
|
177 |
+
self.c_bn = get_norm(norm, w_out)
|
178 |
+
self.c_bn.final_bn = True
|
179 |
+
|
180 |
+
def forward(self, x):
|
181 |
+
for layer in self.children():
|
182 |
+
x = layer(x)
|
183 |
+
return x
|
184 |
+
|
185 |
+
|
186 |
+
class ResBottleneckBlock(CNNBlockBase):
|
187 |
+
"""Residual bottleneck block: x + f(x), f = bottleneck transform."""
|
188 |
+
|
189 |
+
def __init__(self, w_in, w_out, stride, norm, activation_class, params):
|
190 |
+
super().__init__(w_in, w_out, stride)
|
191 |
+
self.proj, self.bn = None, None
|
192 |
+
if (w_in != w_out) or (stride != 1):
|
193 |
+
self.proj = conv2d(w_in, w_out, 1, stride=stride)
|
194 |
+
self.bn = get_norm(norm, w_out)
|
195 |
+
self.f = BottleneckTransform(w_in, w_out, stride, norm, activation_class, params)
|
196 |
+
self.af = activation_class()
|
197 |
+
|
198 |
+
def forward(self, x):
|
199 |
+
x_p = self.bn(self.proj(x)) if self.proj else x
|
200 |
+
return self.af(x_p + self.f(x))
|
201 |
+
|
202 |
+
|
203 |
+
class AnyStage(nn.Module):
|
204 |
+
"""AnyNet stage (sequence of blocks w/ the same output shape)."""
|
205 |
+
|
206 |
+
def __init__(self, w_in, w_out, stride, d, block_class, norm, activation_class, params):
|
207 |
+
super().__init__()
|
208 |
+
for i in range(d):
|
209 |
+
block = block_class(w_in, w_out, stride, norm, activation_class, params)
|
210 |
+
self.add_module("b{}".format(i + 1), block)
|
211 |
+
stride, w_in = 1, w_out
|
212 |
+
|
213 |
+
def forward(self, x):
|
214 |
+
for block in self.children():
|
215 |
+
x = block(x)
|
216 |
+
return x
|
217 |
+
|
218 |
+
|
219 |
+
class AnyNet(Backbone):
|
220 |
+
"""AnyNet model. See :paper:`dds`."""
|
221 |
+
|
222 |
+
def __init__(
|
223 |
+
self,
|
224 |
+
*,
|
225 |
+
stem_class,
|
226 |
+
stem_width,
|
227 |
+
block_class,
|
228 |
+
depths,
|
229 |
+
widths,
|
230 |
+
group_widths,
|
231 |
+
strides,
|
232 |
+
bottleneck_ratios,
|
233 |
+
se_ratio,
|
234 |
+
activation_class,
|
235 |
+
freeze_at=0,
|
236 |
+
norm="BN",
|
237 |
+
out_features=None,
|
238 |
+
):
|
239 |
+
"""
|
240 |
+
Args:
|
241 |
+
stem_class (callable): A callable taking 4 arguments (channels in, channels out,
|
242 |
+
normalization, callable returning an activation function) that returns another
|
243 |
+
callable implementing the stem module.
|
244 |
+
stem_width (int): The number of output channels that the stem produces.
|
245 |
+
block_class (callable): A callable taking 6 arguments (channels in, channels out,
|
246 |
+
stride, normalization, callable returning an activation function, a dict of
|
247 |
+
block-specific parameters) that returns another callable implementing the repeated
|
248 |
+
block module.
|
249 |
+
depths (list[int]): Number of blocks in each stage.
|
250 |
+
widths (list[int]): For each stage, the number of output channels of each block.
|
251 |
+
group_widths (list[int]): For each stage, the number of channels per group in group
|
252 |
+
convolution, if the block uses group convolution.
|
253 |
+
strides (list[int]): The stride that each network stage applies to its input.
|
254 |
+
bottleneck_ratios (list[float]): For each stage, the ratio of the number of bottleneck
|
255 |
+
channels to the number of block input channels (or, equivalently, output channels),
|
256 |
+
if the block uses a bottleneck.
|
257 |
+
se_ratio (float): The ratio of the number of channels used inside the squeeze-excitation
|
258 |
+
(SE) module to it number of input channels, if SE the block uses SE.
|
259 |
+
activation_class (callable): A callable taking no arguments that returns another
|
260 |
+
callable implementing an activation function.
|
261 |
+
freeze_at (int): The number of stages at the beginning to freeze.
|
262 |
+
see :meth:`freeze` for detailed explanation.
|
263 |
+
norm (str or callable): normalization for all conv layers.
|
264 |
+
See :func:`layers.get_norm` for supported format.
|
265 |
+
out_features (list[str]): name of the layers whose outputs should
|
266 |
+
be returned in forward. RegNet's use "stem" and "s1", "s2", etc for the stages after
|
267 |
+
the stem. If None, will return the output of the last layer.
|
268 |
+
"""
|
269 |
+
super().__init__()
|
270 |
+
self.stem = stem_class(3, stem_width, norm, activation_class)
|
271 |
+
|
272 |
+
current_stride = self.stem.stride
|
273 |
+
self._out_feature_strides = {"stem": current_stride}
|
274 |
+
self._out_feature_channels = {"stem": self.stem.out_channels}
|
275 |
+
self.stages_and_names = []
|
276 |
+
prev_w = stem_width
|
277 |
+
|
278 |
+
for i, (d, w, s, b, g) in enumerate(
|
279 |
+
zip(depths, widths, strides, bottleneck_ratios, group_widths)
|
280 |
+
):
|
281 |
+
params = {"bot_mul": b, "group_w": g, "se_r": se_ratio}
|
282 |
+
stage = AnyStage(prev_w, w, s, d, block_class, norm, activation_class, params)
|
283 |
+
name = "s{}".format(i + 1)
|
284 |
+
self.add_module(name, stage)
|
285 |
+
self.stages_and_names.append((stage, name))
|
286 |
+
self._out_feature_strides[name] = current_stride = int(
|
287 |
+
current_stride * np.prod([k.stride for k in stage.children()])
|
288 |
+
)
|
289 |
+
self._out_feature_channels[name] = list(stage.children())[-1].out_channels
|
290 |
+
prev_w = w
|
291 |
+
|
292 |
+
self.apply(init_weights)
|
293 |
+
|
294 |
+
        if out_features is None:
            out_features = [name]
        self._out_features = out_features
        assert len(self._out_features)
        children = [x[0] for x in self.named_children()]
        for out_feature in self._out_features:
            assert out_feature in children, "Available children: {} does not include {}".format(
                ", ".join(children), out_feature
            )
        self.freeze(freeze_at)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert x.dim() == 4, f"Model takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        outputs = {}
        x = self.stem(x)
        if "stem" in self._out_features:
            outputs["stem"] = x
        for stage, name in self.stages_and_names:
            x = stage(x)
            if name in self._out_features:
                outputs[name] = x
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }

    def freeze(self, freeze_at=0):
        """
        Freeze the first several stages of the model. Commonly used in fine-tuning.

        Layers that produce the same feature map spatial size are defined as one
        "stage" by :paper:`FPN`.

        Args:
            freeze_at (int): number of stages to freeze.
                `1` means freezing the stem. `2` means freezing the stem and
                one residual stage, etc.

        Returns:
            nn.Module: this model itself
        """
        if freeze_at >= 1:
            self.stem.freeze()
        for idx, (stage, _) in enumerate(self.stages_and_names, start=2):
            if freeze_at >= idx:
                for block in stage.children():
                    block.freeze()
        return self


def adjust_block_compatibility(ws, bs, gs):
    """Adjusts the compatibility of widths, bottlenecks, and groups."""
    assert len(ws) == len(bs) == len(gs)
    assert all(w > 0 and b > 0 and g > 0 for w, b, g in zip(ws, bs, gs))
    vs = [int(max(1, w * b)) for w, b in zip(ws, bs)]
    gs = [int(min(g, v)) for g, v in zip(gs, vs)]
    ms = [np.lcm(g, b) if b > 1 else g for g, b in zip(gs, bs)]
    vs = [max(m, int(round(v / m) * m)) for v, m in zip(vs, ms)]
    ws = [int(v / b) for v, b in zip(vs, bs)]
    assert all(w * b % g == 0 for w, b, g in zip(ws, bs, gs))
    return ws, bs, gs


def generate_regnet_parameters(w_a, w_0, w_m, d, q=8):
    """Generates per stage widths and depths from RegNet parameters."""
    assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0
    # Generate continuous per-block ws
    ws_cont = np.arange(d) * w_a + w_0
    # Generate quantized per-block ws
    ks = np.round(np.log(ws_cont / w_0) / np.log(w_m))
    ws_all = w_0 * np.power(w_m, ks)
    ws_all = np.round(np.divide(ws_all, q)).astype(int) * q
    # Generate per stage ws and ds (assumes ws_all are sorted)
    ws, ds = np.unique(ws_all, return_counts=True)
    # Compute number of actual stages and total possible stages
    num_stages, total_stages = len(ws), ks.max() + 1
    # Convert numpy arrays to lists and return
    ws, ds, ws_all, ws_cont = (x.tolist() for x in (ws, ds, ws_all, ws_cont))
    return ws, ds, num_stages, total_stages, ws_all, ws_cont


class RegNet(AnyNet):
    """RegNet model. See :paper:`dds`."""

    def __init__(
        self,
        *,
        stem_class,
        stem_width,
        block_class,
        depth,
        w_a,
        w_0,
        w_m,
        group_width,
        stride=2,
        bottleneck_ratio=1.0,
        se_ratio=0.0,
        activation_class=None,
        freeze_at=0,
        norm="BN",
        out_features=None,
    ):
        """
        Build a RegNet from the parameterization described in :paper:`dds` Section 3.3.

        Args:
            See :class:`AnyNet` for arguments that are not listed here.
            depth (int): Total number of blocks in the RegNet.
            w_a (float): Factor by which block width would increase prior to quantizing block widths
                by stage. See :paper:`dds` Section 3.3.
            w_0 (int): Initial block width. See :paper:`dds` Section 3.3.
            w_m (float): Parameter controlling block width quantization.
                See :paper:`dds` Section 3.3.
            group_width (int): Number of channels per group in group convolution, if the block uses
                group convolution.
            bottleneck_ratio (float): The ratio of the number of bottleneck channels to the number
                of block input channels (or, equivalently, output channels), if the block uses a
                bottleneck.
            stride (int): The stride that each network stage applies to its input.
        """
        ws, ds = generate_regnet_parameters(w_a, w_0, w_m, depth)[0:2]
        ss = [stride for _ in ws]
        bs = [bottleneck_ratio for _ in ws]
        gs = [group_width for _ in ws]
        ws, bs, gs = adjust_block_compatibility(ws, bs, gs)

        def default_activation_class():
            return nn.ReLU(inplace=True)

        super().__init__(
            stem_class=stem_class,
            stem_width=stem_width,
            block_class=block_class,
            depths=ds,
            widths=ws,
            strides=ss,
            group_widths=gs,
            bottleneck_ratios=bs,
            se_ratio=se_ratio,
            activation_class=default_activation_class
            if activation_class is None
            else activation_class,
            freeze_at=freeze_at,
            norm=norm,
            out_features=out_features,
        )
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/resnet.py
ADDED
@@ -0,0 +1,694 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn.functional as F
from torch import nn

from annotator.oneformer.detectron2.layers import (
    CNNBlockBase,
    Conv2d,
    DeformConv,
    ModulatedDeformConv,
    ShapeSpec,
    get_norm,
)

from .backbone import Backbone
from .build import BACKBONE_REGISTRY

__all__ = [
    "ResNetBlockBase",
    "BasicBlock",
    "BottleneckBlock",
    "DeformBottleneckBlock",
    "BasicStem",
    "ResNet",
    "make_stage",
    "build_resnet_backbone",
]


class BasicBlock(CNNBlockBase):
    """
    The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
    with two 3x3 conv layers and a projection shortcut if needed.
    """

    def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            stride (int): Stride for the first conv.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
        """
        super().__init__(in_channels, out_channels, stride)

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        self.conv1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        self.conv2 = Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)
        out = self.conv2(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


class BottleneckBlock(CNNBlockBase):
    """
    The standard bottleneck residual block used by ResNet-50, 101 and 152
    defined in :paper:`ResNet`. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1, and a projection shortcut if needed.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
    ):
        """
        Args:
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
            num_groups (int): number of groups for the 3x3 conv layer.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            stride_in_1x1 (bool): when stride>1, whether to put stride in the
                first 1x1 convolution or the bottleneck 3x3 convolution.
            dilation (int): the dilation rate of the 3x3 conv layer.
        """
        super().__init__(in_channels, out_channels, stride)

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        # The original MSRA ResNet models have stride in the first 1x1 conv
        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
        # stride in the 3x3 conv
        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            bias=False,
            groups=num_groups,
            dilation=dilation,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

        # Zero-initialize the last normalization in each residual branch,
        # so that at the beginning, the residual branch starts with zeros,
        # and each residual block behaves like an identity.
        # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
        # "For BN layers, the learnable scaling coefficient γ is initialized
        # to be 1, except for each residual block's last BN
        # where γ is initialized to be 0."

        # nn.init.constant_(self.conv3.norm.weight, 0)
        # TODO this somehow hurts performance when training GN models from scratch.
        # Add it as an option when we need to use this code to train a backbone.

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        out = self.conv2(out)
        out = F.relu_(out)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


class DeformBottleneckBlock(CNNBlockBase):
    """
    Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
    in the 3x3 convolution.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
        deform_modulated=False,
        deform_num_groups=1,
    ):
        super().__init__(in_channels, out_channels, stride)
        self.deform_modulated = deform_modulated

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        if deform_modulated:
            deform_conv_op = ModulatedDeformConv
            # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
            offset_channels = 27
        else:
            deform_conv_op = DeformConv
            offset_channels = 18

        self.conv2_offset = Conv2d(
            bottleneck_channels,
            offset_channels * deform_num_groups,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            dilation=dilation,
        )
        self.conv2 = deform_conv_op(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            bias=False,
            groups=num_groups,
            dilation=dilation,
            deformable_groups=deform_num_groups,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

        nn.init.constant_(self.conv2_offset.weight, 0)
        nn.init.constant_(self.conv2_offset.bias, 0)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        if self.deform_modulated:
            offset_mask = self.conv2_offset(out)
            offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
            offset = torch.cat((offset_x, offset_y), dim=1)
            mask = mask.sigmoid()
            out = self.conv2(out, offset, mask)
        else:
            offset = self.conv2_offset(out)
            out = self.conv2(out, offset)
        out = F.relu_(out)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


class BasicStem(CNNBlockBase):
    """
    The standard ResNet stem (layers before the first residual block),
    with a conv, relu and max_pool.
    """

    def __init__(self, in_channels=3, out_channels=64, norm="BN"):
        """
        Args:
            norm (str or callable): norm after the first conv layer.
                See :func:`layers.get_norm` for supported format.
        """
        super().__init__(in_channels, out_channels, 4)
        self.in_channels = in_channels
        self.conv1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
        weight_init.c2_msra_fill(self.conv1)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu_(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        return x


class ResNet(Backbone):
    """
    Implement :paper:`ResNet`.
    """

    def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0):
        """
        Args:
            stem (nn.Module): a stem module
            stages (list[list[CNNBlockBase]]): several (typically 4) stages,
                each contains multiple :class:`CNNBlockBase`.
            num_classes (None or int): if None, will not perform classification.
                Otherwise, will create a linear layer.
            out_features (list[str]): name of the layers whose outputs should
                be returned in forward. Can be anything in "stem", "linear", or "res2" ...
                If None, will return the output of the last layer.
            freeze_at (int): The number of stages at the beginning to freeze.
                see :meth:`freeze` for detailed explanation.
        """
        super().__init__()
        self.stem = stem
        self.num_classes = num_classes

        current_stride = self.stem.stride
        self._out_feature_strides = {"stem": current_stride}
        self._out_feature_channels = {"stem": self.stem.out_channels}

        self.stage_names, self.stages = [], []

        if out_features is not None:
            # Avoid keeping unused layers in this module. They consume extra memory
            # and may cause allreduce to fail
            num_stages = max(
                [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features]
            )
            stages = stages[:num_stages]
        for i, blocks in enumerate(stages):
            assert len(blocks) > 0, len(blocks)
            for block in blocks:
                assert isinstance(block, CNNBlockBase), block

            name = "res" + str(i + 2)
            stage = nn.Sequential(*blocks)

            self.add_module(name, stage)
            self.stage_names.append(name)
            self.stages.append(stage)

            self._out_feature_strides[name] = current_stride = int(
                current_stride * np.prod([k.stride for k in blocks])
            )
            self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
        self.stage_names = tuple(self.stage_names)  # Make it static for scripting

        if num_classes is not None:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.linear = nn.Linear(curr_channels, num_classes)

            # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
            # "The 1000-way fully-connected layer is initialized by
            # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
            nn.init.normal_(self.linear.weight, std=0.01)
            name = "linear"

        if out_features is None:
            out_features = [name]
        self._out_features = out_features
        assert len(self._out_features)
        children = [x[0] for x in self.named_children()]
        for out_feature in self._out_features:
            assert out_feature in children, "Available children: {}".format(", ".join(children))
        self.freeze(freeze_at)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        outputs = {}
        x = self.stem(x)
        if "stem" in self._out_features:
            outputs["stem"] = x
        for name, stage in zip(self.stage_names, self.stages):
            x = stage(x)
            if name in self._out_features:
                outputs[name] = x
        if self.num_classes is not None:
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.linear(x)
            if "linear" in self._out_features:
                outputs["linear"] = x
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }

    def freeze(self, freeze_at=0):
        """
        Freeze the first several stages of the ResNet. Commonly used in
        fine-tuning.

        Layers that produce the same feature map spatial size are defined as one
        "stage" by :paper:`FPN`.

        Args:
            freeze_at (int): number of stages to freeze.
                `1` means freezing the stem. `2` means freezing the stem and
                one residual stage, etc.

        Returns:
            nn.Module: this ResNet itself
        """
        if freeze_at >= 1:
            self.stem.freeze()
        for idx, stage in enumerate(self.stages, start=2):
            if freeze_at >= idx:
                for block in stage.children():
                    block.freeze()
        return self

    @staticmethod
    def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
        """
        Create a list of blocks of the same type that forms one ResNet stage.

        Args:
            block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
                stage. A module of this type must not change spatial resolution of inputs unless its
                stride != 1.
            num_blocks (int): number of blocks in this stage
            in_channels (int): input channels of the entire stage.
            out_channels (int): output channels of **every block** in the stage.
            kwargs: other arguments passed to the constructor of
                `block_class`. If the argument name is "xx_per_block", the
                argument is a list of values to be passed to each block in the
                stage. Otherwise, the same argument is passed to every block
                in the stage.

        Returns:
            list[CNNBlockBase]: a list of block module.

        Examples:
        ::
            stage = ResNet.make_stage(
                BottleneckBlock, 3, in_channels=16, out_channels=64,
                bottleneck_channels=16, num_groups=1,
                stride_per_block=[2, 1, 1],
                dilations_per_block=[1, 1, 2]
            )

        Usually, layers that produce the same feature map spatial size are defined as one
        "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
        all be 1.
        """
        blocks = []
        for i in range(num_blocks):
            curr_kwargs = {}
            for k, v in kwargs.items():
                if k.endswith("_per_block"):
                    assert len(v) == num_blocks, (
                        f"Argument '{k}' of make_stage should have the "
                        f"same length as num_blocks={num_blocks}."
                    )
                    newk = k[: -len("_per_block")]
                    assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
                    curr_kwargs[newk] = v[i]
                else:
                    curr_kwargs[k] = v

            blocks.append(
                block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
            )
            in_channels = out_channels
        return blocks

    @staticmethod
    def make_default_stages(depth, block_class=None, **kwargs):
        """
        Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152).
        If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
        instead for fine-grained customization.

        Args:
            depth (int): depth of ResNet
            block_class (type): the CNN block class. Has to accept
                `bottleneck_channels` argument for depth > 50.
                By default it is BasicBlock or BottleneckBlock, based on the
                depth.
            kwargs:
                other arguments to pass to `make_stage`. Should not contain
                stride and channels, as they are predefined for each depth.

        Returns:
            list[list[CNNBlockBase]]: modules in all stages; see arguments of
                :class:`ResNet.__init__`.
        """
        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]
        if block_class is None:
            block_class = BasicBlock if depth < 50 else BottleneckBlock
        if depth < 50:
            in_channels = [64, 64, 128, 256]
            out_channels = [64, 128, 256, 512]
        else:
            in_channels = [64, 256, 512, 1024]
            out_channels = [256, 512, 1024, 2048]
        ret = []
        for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
            if depth >= 50:
                kwargs["bottleneck_channels"] = o // 4
            ret.append(
                ResNet.make_stage(
                    block_class=block_class,
                    num_blocks=n,
                    stride_per_block=[s] + [1] * (n - 1),
                    in_channels=i,
                    out_channels=o,
                    **kwargs,
                )
            )
        return ret


ResNetBlockBase = CNNBlockBase
"""
Alias for backward compatibiltiy.
"""


def make_stage(*args, **kwargs):
    """
    Deprecated alias for backward compatibiltiy.
    """
    return ResNet.make_stage(*args, **kwargs)


@BACKBONE_REGISTRY.register()
def build_resnet_backbone(cfg, input_shape):
    """
    Create a ResNet instance from config.

    Returns:
        ResNet: a :class:`ResNet` instance.
    """
    # need registration of new blocks/stems?
    norm = cfg.MODEL.RESNETS.NORM
    stem = BasicStem(
        in_channels=input_shape.channels,
        out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
        norm=norm,
    )

    # fmt: off
    freeze_at           = cfg.MODEL.BACKBONE.FREEZE_AT
    out_features        = cfg.MODEL.RESNETS.OUT_FEATURES
    depth               = cfg.MODEL.RESNETS.DEPTH
    num_groups          = cfg.MODEL.RESNETS.NUM_GROUPS
    width_per_group     = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
    bottleneck_channels = num_groups * width_per_group
    in_channels         = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
    out_channels        = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
    stride_in_1x1       = cfg.MODEL.RESNETS.STRIDE_IN_1X1
    res5_dilation       = cfg.MODEL.RESNETS.RES5_DILATION
    deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
    deform_modulated    = cfg.MODEL.RESNETS.DEFORM_MODULATED
    deform_num_groups   = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
    # fmt: on
    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)

    num_blocks_per_stage = {
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3],
        50: [3, 4, 6, 3],
        101: [3, 4, 23, 3],
        152: [3, 8, 36, 3],
    }[depth]

    if depth in [18, 34]:
        assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
        assert not any(
            deform_on_per_stage
        ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
        assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
        assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"

    stages = []

    for idx, stage_idx in enumerate(range(2, 6)):
        # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
        dilation = res5_dilation if stage_idx == 5 else 1
        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
        stage_kargs = {
            "num_blocks": num_blocks_per_stage[idx],
            "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
            "in_channels": in_channels,
            "out_channels": out_channels,
            "norm": norm,
        }
        # Use BasicBlock for R18 and R34.
        if depth in [18, 34]:
            stage_kargs["block_class"] = BasicBlock
        else:
            stage_kargs["bottleneck_channels"] = bottleneck_channels
            stage_kargs["stride_in_1x1"] = stride_in_1x1
            stage_kargs["dilation"] = dilation
            stage_kargs["num_groups"] = num_groups
            if deform_on_per_stage[idx]:
                stage_kargs["block_class"] = DeformBottleneckBlock
                stage_kargs["deform_modulated"] = deform_modulated
                stage_kargs["deform_num_groups"] = deform_num_groups
            else:
                stage_kargs["block_class"] = BottleneckBlock
        blocks = ResNet.make_stage(**stage_kargs)
        in_channels = out_channels
        out_channels *= 2
        bottleneck_channels *= 2
        stages.append(blocks)
    return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at)
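
Besides the config-driven build_resnet_backbone entry point, the classes in this file can be used directly. A small sketch (not part of the diff; the import path is assumed from the file location) that builds a ResNet-18 trunk and inspects its feature maps:

    import torch
    from annotator.oneformer.detectron2.modeling.backbone.resnet import BasicStem, ResNet  # assumed path

    stem = BasicStem(in_channels=3, out_channels=64, norm="BN")
    stages = ResNet.make_default_stages(depth=18, norm="BN")  # BasicBlock stages for R18/R34
    model = ResNet(stem, stages, out_features=["res2", "res4"], freeze_at=1)  # freeze the stem only

    print(model.output_shape())  # ShapeSpec per output, e.g. res2 -> channels=64, stride=4
    feats = model(torch.zeros(1, 3, 224, 224))
    print({k: v.shape for k, v in feats.items()})  # res2: 1x64x56x56, res4: 1x256x14x14

Note that unused trailing stages (res5 here) are dropped by ResNet.__init__ when out_features is given, which keeps memory down when only intermediate features are requested.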
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/swin.py
ADDED
@@ -0,0 +1,695 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
2 |
+
"""
|
3 |
+
Implementation of Swin models from :paper:`swin`.
|
4 |
+
|
5 |
+
This code is adapted from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py with minimal modifications. # noqa
|
6 |
+
--------------------------------------------------------
|
7 |
+
Swin Transformer
|
8 |
+
Copyright (c) 2021 Microsoft
|
9 |
+
Licensed under The MIT License [see LICENSE for details]
|
10 |
+
Written by Ze Liu, Yutong Lin, Yixuan Wei
|
11 |
+
--------------------------------------------------------
|
12 |
+
LICENSE: https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/461e003166a8083d0b620beacd4662a2df306bd6/LICENSE
|
13 |
+
"""
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
import torch
|
17 |
+
import torch.nn as nn
|
18 |
+
import torch.nn.functional as F
|
19 |
+
import torch.utils.checkpoint as checkpoint
|
20 |
+
|
21 |
+
from annotator.oneformer.detectron2.modeling.backbone.backbone import Backbone
|
22 |
+
|
23 |
+
_to_2tuple = nn.modules.utils._ntuple(2)
|
24 |
+
|
25 |
+
|
26 |
+
class Mlp(nn.Module):
|
27 |
+
"""Multilayer perceptron."""
|
28 |
+
|
29 |
+
def __init__(
|
30 |
+
self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
|
31 |
+
):
|
32 |
+
super().__init__()
|
33 |
+
out_features = out_features or in_features
|
34 |
+
hidden_features = hidden_features or in_features
|
35 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
36 |
+
self.act = act_layer()
|
37 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
38 |
+
self.drop = nn.Dropout(drop)
|
39 |
+
|
40 |
+
def forward(self, x):
|
41 |
+
x = self.fc1(x)
|
42 |
+
x = self.act(x)
|
43 |
+
x = self.drop(x)
|
44 |
+
x = self.fc2(x)
|
45 |
+
x = self.drop(x)
|
46 |
+
return x
|
47 |
+
|
48 |
+
|
49 |
+
def window_partition(x, window_size):
|
50 |
+
"""
|
51 |
+
Args:
|
52 |
+
x: (B, H, W, C)
|
53 |
+
window_size (int): window size
|
54 |
+
Returns:
|
55 |
+
windows: (num_windows*B, window_size, window_size, C)
|
56 |
+
"""
|
57 |
+
B, H, W, C = x.shape
|
58 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
59 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
60 |
+
return windows
|
61 |
+
|
62 |
+
|
63 |
+
def window_reverse(windows, window_size, H, W):
|
64 |
+
"""
|
65 |
+
Args:
|
66 |
+
windows: (num_windows*B, window_size, window_size, C)
|
67 |
+
window_size (int): Window size
|
68 |
+
H (int): Height of image
|
69 |
+
W (int): Width of image
|
70 |
+
Returns:
|
71 |
+
x: (B, H, W, C)
|
72 |
+
"""
|
73 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
74 |
+
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
|
75 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
76 |
+
return x
|
77 |
+
|
78 |
+
|
79 |
+
class WindowAttention(nn.Module):
|
80 |
+
"""Window based multi-head self attention (W-MSA) module with relative position bias.
|
81 |
+
It supports both of shifted and non-shifted window.
|
82 |
+
Args:
|
83 |
+
dim (int): Number of input channels.
|
84 |
+
window_size (tuple[int]): The height and width of the window.
|
85 |
+
num_heads (int): Number of attention heads.
|
86 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
|
87 |
+
Default: True
|
88 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
|
89 |
+
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
|
90 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
91 |
+
"""
|
92 |
+
|
93 |
+
def __init__(
|
94 |
+
self,
|
95 |
+
dim,
|
96 |
+
window_size,
|
97 |
+
num_heads,
|
98 |
+
qkv_bias=True,
|
99 |
+
qk_scale=None,
|
100 |
+
attn_drop=0.0,
|
101 |
+
proj_drop=0.0,
|
102 |
+
):
|
103 |
+
|
104 |
+
super().__init__()
|
105 |
+
self.dim = dim
|
106 |
+
self.window_size = window_size # Wh, Ww
|
107 |
+
self.num_heads = num_heads
|
108 |
+
head_dim = dim // num_heads
|
109 |
+
self.scale = qk_scale or head_dim**-0.5
|
110 |
+
|
111 |
+
# define a parameter table of relative position bias
|
112 |
+
self.relative_position_bias_table = nn.Parameter(
|
113 |
+
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
|
114 |
+
) # 2*Wh-1 * 2*Ww-1, nH
|
115 |
+
|
116 |
+
# get pair-wise relative position index for each token inside the window
|
117 |
+
coords_h = torch.arange(self.window_size[0])
|
118 |
+
coords_w = torch.arange(self.window_size[1])
|
119 |
+
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
120 |
+
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
121 |
+
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
|
122 |
+
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
|
123 |
+
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
|
124 |
+
relative_coords[:, :, 1] += self.window_size[1] - 1
|
125 |
+
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
|
126 |
+
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
|
127 |
+
self.register_buffer("relative_position_index", relative_position_index)
|
128 |
+
|
129 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
130 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
131 |
+
self.proj = nn.Linear(dim, dim)
|
132 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
133 |
+
|
134 |
+
nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)
|
135 |
+
self.softmax = nn.Softmax(dim=-1)
|
136 |
+
|
137 |
+
def forward(self, x, mask=None):
|
138 |
+
"""Forward function.
|
139 |
+
Args:
|
140 |
+
x: input features with shape of (num_windows*B, N, C)
|
141 |
+
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
|
142 |
+
"""
|
143 |
+
B_, N, C = x.shape
|
144 |
+
qkv = (
|
145 |
+
self.qkv(x)
|
146 |
+
.reshape(B_, N, 3, self.num_heads, C // self.num_heads)
|
147 |
+
.permute(2, 0, 3, 1, 4)
|
148 |
+
)
|
149 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
150 |
+
|
151 |
+
q = q * self.scale
|
152 |
+
attn = q @ k.transpose(-2, -1)
|
153 |
+
|
154 |
+
relative_position_bias = self.relative_position_bias_table[
|
155 |
+
self.relative_position_index.view(-1)
|
156 |
+
].view(
|
157 |
+
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
|
158 |
+
) # Wh*Ww,Wh*Ww,nH
|
159 |
+
relative_position_bias = relative_position_bias.permute(
|
160 |
+
2, 0, 1
|
161 |
+
).contiguous() # nH, Wh*Ww, Wh*Ww
|
162 |
+
attn = attn + relative_position_bias.unsqueeze(0)
|
163 |
+
|
164 |
+
if mask is not None:
|
165 |
+
nW = mask.shape[0]
|
166 |
+
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
|
167 |
+
attn = attn.view(-1, self.num_heads, N, N)
|
168 |
+
attn = self.softmax(attn)
|
169 |
+
else:
|
170 |
+
attn = self.softmax(attn)
|
171 |
+
|
172 |
+
attn = self.attn_drop(attn)
|
173 |
+
|
174 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
175 |
+
x = self.proj(x)
|
176 |
+
x = self.proj_drop(x)
|
177 |
+
return x
|
178 |
+
|
179 |
+
|
180 |
+
class SwinTransformerBlock(nn.Module):
|
181 |
+
"""Swin Transformer Block.
|
182 |
+
Args:
|
183 |
+
dim (int): Number of input channels.
|
184 |
+
num_heads (int): Number of attention heads.
|
185 |
+
window_size (int): Window size.
|
186 |
+
shift_size (int): Shift size for SW-MSA.
|
187 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
188 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
189 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
190 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
191 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
192 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
193 |
+
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
|
194 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
195 |
+
"""
|
196 |
+
|
197 |
+
def __init__(
|
198 |
+
self,
|
199 |
+
dim,
|
200 |
+
num_heads,
|
201 |
+
window_size=7,
|
202 |
+
shift_size=0,
|
203 |
+
mlp_ratio=4.0,
|
204 |
+
qkv_bias=True,
|
205 |
+
qk_scale=None,
|
206 |
+
drop=0.0,
|
207 |
+
attn_drop=0.0,
|
208 |
+
drop_path=0.0,
|
209 |
+
act_layer=nn.GELU,
|
210 |
+
norm_layer=nn.LayerNorm,
|
211 |
+
):
|
212 |
+
super().__init__()
|
213 |
+
self.dim = dim
|
214 |
+
self.num_heads = num_heads
|
215 |
+
self.window_size = window_size
|
216 |
+
self.shift_size = shift_size
|
217 |
+
self.mlp_ratio = mlp_ratio
|
218 |
+
assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
|
219 |
+
|
220 |
+
self.norm1 = norm_layer(dim)
|
221 |
+
self.attn = WindowAttention(
|
222 |
+
dim,
|
223 |
+
window_size=_to_2tuple(self.window_size),
|
224 |
+
num_heads=num_heads,
|
225 |
+
qkv_bias=qkv_bias,
|
226 |
+
qk_scale=qk_scale,
|
227 |
+
attn_drop=attn_drop,
|
228 |
+
proj_drop=drop,
|
229 |
+
)
|
230 |
+
|
231 |
+
if drop_path > 0.0:
|
232 |
+
from timm.models.layers import DropPath
|
233 |
+
|
234 |
+
self.drop_path = DropPath(drop_path)
|
235 |
+
else:
|
236 |
+
self.drop_path = nn.Identity()
|
237 |
+
self.norm2 = norm_layer(dim)
|
238 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
239 |
+
self.mlp = Mlp(
|
240 |
+
in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
|
241 |
+
)
|
242 |
+
|
243 |
+
self.H = None
|
244 |
+
self.W = None
|
245 |
+
|
246 |
+
def forward(self, x, mask_matrix):
|
247 |
+
"""Forward function.
|
248 |
+
Args:
|
249 |
+
x: Input feature, tensor size (B, H*W, C).
|
250 |
+
H, W: Spatial resolution of the input feature.
|
251 |
+
mask_matrix: Attention mask for cyclic shift.
|
252 |
+
"""
|
253 |
+
B, L, C = x.shape
|
254 |
+
H, W = self.H, self.W
|
255 |
+
assert L == H * W, "input feature has wrong size"
|
256 |
+
|
257 |
+
shortcut = x
|
258 |
+
x = self.norm1(x)
|
259 |
+
x = x.view(B, H, W, C)
|
260 |
+
|
261 |
+
# pad feature maps to multiples of window size
|
262 |
+
pad_l = pad_t = 0
|
263 |
+
pad_r = (self.window_size - W % self.window_size) % self.window_size
|
264 |
+
pad_b = (self.window_size - H % self.window_size) % self.window_size
|
265 |
+
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
|
266 |
+
_, Hp, Wp, _ = x.shape
|
267 |
+
|
268 |
+
# cyclic shift
|
269 |
+
if self.shift_size > 0:
|
270 |
+
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
|
271 |
+
attn_mask = mask_matrix
|
272 |
+
else:
|
273 |
+
shifted_x = x
|
274 |
+
attn_mask = None
|
275 |
+
|
276 |
+
# partition windows
|
277 |
+
x_windows = window_partition(
|
278 |
+
shifted_x, self.window_size
|
279 |
+
) # nW*B, window_size, window_size, C
|
280 |
+
x_windows = x_windows.view(
|
281 |
+
-1, self.window_size * self.window_size, C
|
282 |
+
) # nW*B, window_size*window_size, C
|
283 |
+
|
284 |
+
# W-MSA/SW-MSA
|
285 |
+
attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
|
286 |
+
|
287 |
+
# merge windows
|
288 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
289 |
+
shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
|
290 |
+
|
291 |
+
# reverse cyclic shift
|
292 |
+
if self.shift_size > 0:
|
293 |
+
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
|
294 |
+
else:
|
295 |
+
x = shifted_x
|
296 |
+
|
297 |
+
if pad_r > 0 or pad_b > 0:
|
298 |
+
x = x[:, :H, :W, :].contiguous()
|
299 |
+
|
300 |
+
x = x.view(B, H * W, C)
|
301 |
+
|
302 |
+
# FFN
|
303 |
+
x = shortcut + self.drop_path(x)
|
304 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
305 |
+
|
306 |
+
return x
|
307 |
+
|
308 |
+
|
309 |
+
class PatchMerging(nn.Module):
|
310 |
+
"""Patch Merging Layer
|
311 |
+
Args:
|
312 |
+
dim (int): Number of input channels.
|
313 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
314 |
+
"""
|
315 |
+
|
316 |
+
def __init__(self, dim, norm_layer=nn.LayerNorm):
|
317 |
+
super().__init__()
|
318 |
+
self.dim = dim
|
319 |
+
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
|
320 |
+
self.norm = norm_layer(4 * dim)
|
321 |
+
|
322 |
+
def forward(self, x, H, W):
|
323 |
+
"""Forward function.
|
324 |
+
Args:
|
325 |
+
x: Input feature, tensor size (B, H*W, C).
|
326 |
+
H, W: Spatial resolution of the input feature.
|
327 |
+
"""
|
328 |
+
B, L, C = x.shape
|
329 |
+
assert L == H * W, "input feature has wrong size"
|
330 |
+
|
331 |
+
x = x.view(B, H, W, C)
|
332 |
+
|
333 |
+
# padding
|
334 |
+
pad_input = (H % 2 == 1) or (W % 2 == 1)
|
335 |
+
if pad_input:
|
336 |
+
x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
|
337 |
+
|
338 |
+
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
|
339 |
+
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
|
340 |
+
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
|
341 |
+
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
|
342 |
+
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
|
343 |
+
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
|
344 |
+
|
345 |
+
x = self.norm(x)
|
346 |
+
x = self.reduction(x)
|
347 |
+
|
348 |
+
return x
|
349 |
+
|
350 |
+
|
351 |
+
class BasicLayer(nn.Module):
|
352 |
+
"""A basic Swin Transformer layer for one stage.
|
353 |
+
Args:
|
354 |
+
dim (int): Number of feature channels
|
355 |
+
depth (int): Depths of this stage.
|
356 |
+
num_heads (int): Number of attention head.
|
357 |
+
window_size (int): Local window size. Default: 7.
|
358 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
359 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
360 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
361 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
362 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
363 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
364 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
365 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer.
|
366 |
+
Default: None
|
367 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
368 |
+
"""
|
369 |
+
|
370 |
+
def __init__(
|
371 |
+
self,
|
372 |
+
dim,
|
373 |
+
depth,
|
374 |
+
num_heads,
|
375 |
+
window_size=7,
|
376 |
+
mlp_ratio=4.0,
|
377 |
+
qkv_bias=True,
|
378 |
+
qk_scale=None,
|
379 |
+
drop=0.0,
|
380 |
+
attn_drop=0.0,
|
381 |
+
drop_path=0.0,
|
382 |
+
norm_layer=nn.LayerNorm,
|
383 |
+
downsample=None,
|
384 |
+
use_checkpoint=False,
|
385 |
+
):
|
386 |
+
super().__init__()
|
387 |
+
self.window_size = window_size
|
388 |
+
self.shift_size = window_size // 2
|
389 |
+
self.depth = depth
|
390 |
+
self.use_checkpoint = use_checkpoint
|
391 |
+
|
392 |
+
# build blocks
|
393 |
+
self.blocks = nn.ModuleList(
|
394 |
+
[
|
395 |
+
SwinTransformerBlock(
|
396 |
+
dim=dim,
|
397 |
+
num_heads=num_heads,
|
398 |
+
window_size=window_size,
|
399 |
+
shift_size=0 if (i % 2 == 0) else window_size // 2,
|
400 |
+
mlp_ratio=mlp_ratio,
|
401 |
+
qkv_bias=qkv_bias,
|
402 |
+
qk_scale=qk_scale,
|
403 |
+
drop=drop,
|
404 |
+
attn_drop=attn_drop,
|
405 |
+
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
|
406 |
+
norm_layer=norm_layer,
|
407 |
+
)
|
408 |
+
for i in range(depth)
|
409 |
+
]
|
410 |
+
)
|
411 |
+
|
412 |
+
# patch merging layer
|
413 |
+
if downsample is not None:
|
414 |
+
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
|
415 |
+
else:
|
416 |
+
self.downsample = None
|
417 |
+
|
418 |
+
def forward(self, x, H, W):
|
419 |
+
"""Forward function.
|
420 |
+
Args:
|
421 |
+
x: Input feature, tensor size (B, H*W, C).
|
422 |
+
H, W: Spatial resolution of the input feature.
|
423 |
+
"""
|
424 |
+
|
425 |
+
# calculate attention mask for SW-MSA
|
426 |
+
Hp = int(np.ceil(H / self.window_size)) * self.window_size
|
427 |
+
Wp = int(np.ceil(W / self.window_size)) * self.window_size
|
428 |
+
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
|
429 |
+
h_slices = (
|
430 |
+
slice(0, -self.window_size),
|
431 |
+
slice(-self.window_size, -self.shift_size),
|
432 |
+
slice(-self.shift_size, None),
|
433 |
+
)
|
434 |
+
w_slices = (
|
435 |
+
slice(0, -self.window_size),
|
436 |
+
slice(-self.window_size, -self.shift_size),
|
437 |
+
slice(-self.shift_size, None),
|
438 |
+
)
|
439 |
+
cnt = 0
|
440 |
+
for h in h_slices:
|
441 |
+
for w in w_slices:
|
442 |
+
img_mask[:, h, w, :] = cnt
|
443 |
+
cnt += 1
|
444 |
+
|
445 |
+
mask_windows = window_partition(
|
446 |
+
img_mask, self.window_size
|
447 |
+
) # nW, window_size, window_size, 1
|
448 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
449 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
450 |
+
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
|
451 |
+
attn_mask == 0, float(0.0)
|
452 |
+
)
|
453 |
+
|
454 |
+
for blk in self.blocks:
|
455 |
+
blk.H, blk.W = H, W
|
456 |
+
if self.use_checkpoint:
|
457 |
+
x = checkpoint.checkpoint(blk, x, attn_mask)
|
458 |
+
else:
|
459 |
+
x = blk(x, attn_mask)
|
460 |
+
if self.downsample is not None:
|
461 |
+
x_down = self.downsample(x, H, W)
|
462 |
+
Wh, Ww = (H + 1) // 2, (W + 1) // 2
|
463 |
+
return x, H, W, x_down, Wh, Ww
|
464 |
+
else:
|
465 |
+
return x, H, W, x, H, W
|
466 |
+
|
467 |
+
|
468 |
+
class PatchEmbed(nn.Module):
|
469 |
+
"""Image to Patch Embedding
|
470 |
+
Args:
|
471 |
+
patch_size (int): Patch token size. Default: 4.
|
472 |
+
in_chans (int): Number of input image channels. Default: 3.
|
473 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
474 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: None
|
475 |
+
"""
|
476 |
+
|
477 |
+
def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
|
478 |
+
super().__init__()
|
479 |
+
patch_size = _to_2tuple(patch_size)
|
480 |
+
self.patch_size = patch_size
|
481 |
+
|
482 |
+
self.in_chans = in_chans
|
483 |
+
self.embed_dim = embed_dim
|
484 |
+
|
485 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
|
486 |
+
if norm_layer is not None:
|
487 |
+
self.norm = norm_layer(embed_dim)
|
488 |
+
else:
|
489 |
+
self.norm = None
|
490 |
+
|
491 |
+
def forward(self, x):
|
492 |
+
"""Forward function."""
|
493 |
+
# padding
|
494 |
+
_, _, H, W = x.size()
|
495 |
+
if W % self.patch_size[1] != 0:
|
496 |
+
x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
|
497 |
+
if H % self.patch_size[0] != 0:
|
498 |
+
x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
|
499 |
+
|
500 |
+
x = self.proj(x) # B C Wh Ww
|
501 |
+
if self.norm is not None:
|
502 |
+
Wh, Ww = x.size(2), x.size(3)
|
503 |
+
x = x.flatten(2).transpose(1, 2)
|
504 |
+
x = self.norm(x)
|
505 |
+
x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
|
506 |
+
|
507 |
+
return x
|
508 |
+
|
509 |
+
|
510 |
+
class SwinTransformer(Backbone):
|
511 |
+
"""Swin Transformer backbone.
|
512 |
+
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted
|
513 |
+
Windows` - https://arxiv.org/pdf/2103.14030
|
514 |
+
Args:
|
515 |
+
pretrain_img_size (int): Input image size for training the pretrained model,
|
516 |
+
used in absolute postion embedding. Default 224.
|
517 |
+
patch_size (int | tuple(int)): Patch size. Default: 4.
|
518 |
+
in_chans (int): Number of input image channels. Default: 3.
|
519 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
520 |
+
depths (tuple[int]): Depths of each Swin Transformer stage.
|
521 |
+
num_heads (tuple[int]): Number of attention head of each stage.
|
522 |
+
window_size (int): Window size. Default: 7.
|
523 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
524 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
|
525 |
+
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
|
526 |
+
drop_rate (float): Dropout rate.
|
527 |
+
attn_drop_rate (float): Attention dropout rate. Default: 0.
|
528 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
|
529 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
530 |
+
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
|
531 |
+
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
|
532 |
+
out_indices (Sequence[int]): Output from which stages.
|
533 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
|
534 |
+
-1 means not freezing any parameters.
|
535 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
536 |
+
"""
|
537 |
+
|
538 |
+
def __init__(
|
539 |
+
self,
|
540 |
+
pretrain_img_size=224,
|
541 |
+
patch_size=4,
|
542 |
+
in_chans=3,
|
543 |
+
embed_dim=96,
|
544 |
+
depths=(2, 2, 6, 2),
|
545 |
+
num_heads=(3, 6, 12, 24),
|
546 |
+
window_size=7,
|
547 |
+
mlp_ratio=4.0,
|
548 |
+
qkv_bias=True,
|
549 |
+
qk_scale=None,
|
550 |
+
drop_rate=0.0,
|
551 |
+
attn_drop_rate=0.0,
|
552 |
+
drop_path_rate=0.2,
|
553 |
+
norm_layer=nn.LayerNorm,
|
554 |
+
ape=False,
|
555 |
+
patch_norm=True,
|
556 |
+
out_indices=(0, 1, 2, 3),
|
557 |
+
frozen_stages=-1,
|
558 |
+
use_checkpoint=False,
|
559 |
+
):
|
560 |
+
super().__init__()
|
561 |
+
|
562 |
+
self.pretrain_img_size = pretrain_img_size
|
563 |
+
self.num_layers = len(depths)
|
564 |
+
self.embed_dim = embed_dim
|
565 |
+
self.ape = ape
|
566 |
+
self.patch_norm = patch_norm
|
567 |
+
self.out_indices = out_indices
|
568 |
+
self.frozen_stages = frozen_stages
|
569 |
+
|
570 |
+
# split image into non-overlapping patches
|
571 |
+
self.patch_embed = PatchEmbed(
|
572 |
+
patch_size=patch_size,
|
573 |
+
in_chans=in_chans,
|
574 |
+
embed_dim=embed_dim,
|
575 |
+
norm_layer=norm_layer if self.patch_norm else None,
|
576 |
+
)
|
577 |
+
|
578 |
+
# absolute position embedding
|
579 |
+
if self.ape:
|
580 |
+
pretrain_img_size = _to_2tuple(pretrain_img_size)
|
581 |
+
patch_size = _to_2tuple(patch_size)
|
582 |
+
patches_resolution = [
|
583 |
+
pretrain_img_size[0] // patch_size[0],
|
584 |
+
pretrain_img_size[1] // patch_size[1],
|
585 |
+
]
|
586 |
+
|
587 |
+
self.absolute_pos_embed = nn.Parameter(
|
588 |
+
torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
|
589 |
+
)
|
590 |
+
nn.init.trunc_normal_(self.absolute_pos_embed, std=0.02)
|
591 |
+
|
592 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
593 |
+
|
594 |
+
# stochastic depth
|
595 |
+
dpr = [
|
596 |
+
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
|
597 |
+
] # stochastic depth decay rule
|
598 |
+
|
599 |
+
# build layers
|
600 |
+
self.layers = nn.ModuleList()
|
601 |
+
for i_layer in range(self.num_layers):
|
602 |
+
layer = BasicLayer(
|
603 |
+
dim=int(embed_dim * 2**i_layer),
|
604 |
+
depth=depths[i_layer],
|
605 |
+
num_heads=num_heads[i_layer],
|
606 |
+
window_size=window_size,
|
607 |
+
mlp_ratio=mlp_ratio,
|
608 |
+
qkv_bias=qkv_bias,
|
609 |
+
qk_scale=qk_scale,
|
610 |
+
drop=drop_rate,
|
611 |
+
attn_drop=attn_drop_rate,
|
612 |
+
drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
|
613 |
+
norm_layer=norm_layer,
|
614 |
+
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
|
615 |
+
use_checkpoint=use_checkpoint,
|
616 |
+
)
|
617 |
+
self.layers.append(layer)
|
618 |
+
|
619 |
+
num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
|
620 |
+
self.num_features = num_features
|
621 |
+
|
622 |
+
# add a norm layer for each output
|
623 |
+
for i_layer in out_indices:
|
624 |
+
layer = norm_layer(num_features[i_layer])
|
625 |
+
layer_name = f"norm{i_layer}"
|
626 |
+
self.add_module(layer_name, layer)
|
627 |
+
|
628 |
+
self._freeze_stages()
|
629 |
+
self._out_features = ["p{}".format(i) for i in self.out_indices]
|
630 |
+
self._out_feature_channels = {
|
631 |
+
"p{}".format(i): self.embed_dim * 2**i for i in self.out_indices
|
632 |
+
}
|
633 |
+
self._out_feature_strides = {"p{}".format(i): 2 ** (i + 2) for i in self.out_indices}
|
634 |
+
self._size_divisibility = 32
|
635 |
+
|
636 |
+
self.apply(self._init_weights)
|
637 |
+
|
638 |
+
def _freeze_stages(self):
|
639 |
+
if self.frozen_stages >= 0:
|
640 |
+
self.patch_embed.eval()
|
641 |
+
for param in self.patch_embed.parameters():
|
642 |
+
param.requires_grad = False
|
643 |
+
|
644 |
+
if self.frozen_stages >= 1 and self.ape:
|
645 |
+
self.absolute_pos_embed.requires_grad = False
|
646 |
+
|
647 |
+
if self.frozen_stages >= 2:
|
648 |
+
self.pos_drop.eval()
|
649 |
+
for i in range(0, self.frozen_stages - 1):
|
650 |
+
m = self.layers[i]
|
651 |
+
m.eval()
|
652 |
+
for param in m.parameters():
|
653 |
+
param.requires_grad = False
|
654 |
+
|
655 |
+
def _init_weights(self, m):
|
656 |
+
if isinstance(m, nn.Linear):
|
657 |
+
nn.init.trunc_normal_(m.weight, std=0.02)
|
658 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
659 |
+
nn.init.constant_(m.bias, 0)
|
660 |
+
elif isinstance(m, nn.LayerNorm):
|
661 |
+
nn.init.constant_(m.bias, 0)
|
662 |
+
nn.init.constant_(m.weight, 1.0)
|
663 |
+
|
664 |
+
@property
|
665 |
+
def size_divisibility(self):
|
666 |
+
return self._size_divisibility
|
667 |
+
|
668 |
+
def forward(self, x):
|
669 |
+
"""Forward function."""
|
670 |
+
x = self.patch_embed(x)
|
671 |
+
|
672 |
+
Wh, Ww = x.size(2), x.size(3)
|
673 |
+
if self.ape:
|
674 |
+
# interpolate the position embedding to the corresponding size
|
675 |
+
absolute_pos_embed = F.interpolate(
|
676 |
+
self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
|
677 |
+
)
|
678 |
+
x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
|
679 |
+
else:
|
680 |
+
x = x.flatten(2).transpose(1, 2)
|
681 |
+
x = self.pos_drop(x)
|
682 |
+
|
683 |
+
outs = {}
|
684 |
+
for i in range(self.num_layers):
|
685 |
+
layer = self.layers[i]
|
686 |
+
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
|
687 |
+
|
688 |
+
if i in self.out_indices:
|
689 |
+
norm_layer = getattr(self, f"norm{i}")
|
690 |
+
x_out = norm_layer(x_out)
|
691 |
+
|
692 |
+
out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
|
693 |
+
outs["p{}".format(i)] = out
|
694 |
+
|
695 |
+
return outs
|
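For orientation, here is a minimal usage sketch of the backbone assembled above. It is not part of the diff: the enclosing class is assumed to be `SwinTransformer`, as in upstream detectron2's swin.py, and the constructor defaults shown above (embed_dim=96, depths=(2, 2, 6, 2), out_indices=(0, 1, 2, 3)) are kept.

import torch
from annotator.oneformer.detectron2.modeling.backbone.swin import SwinTransformer  # assumed class name

model = SwinTransformer().eval()            # Swin-T-style configuration from the defaults above
with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))
# forward() returns a dict keyed "p0".."p3" with strides 4, 8, 16, 32 and channels
# embed_dim * 2**i = 96, 192, 384, 768; e.g. feats["p0"].shape == (1, 96, 56, 56)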
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/utils.py
ADDED
@@ -0,0 +1,186 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
2 |
+
import math
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
|
7 |
+
__all__ = [
|
8 |
+
"window_partition",
|
9 |
+
"window_unpartition",
|
10 |
+
"add_decomposed_rel_pos",
|
11 |
+
"get_abs_pos",
|
12 |
+
"PatchEmbed",
|
13 |
+
]
|
14 |
+
|
15 |
+
|
16 |
+
def window_partition(x, window_size):
|
17 |
+
"""
|
18 |
+
Partition into non-overlapping windows with padding if needed.
|
19 |
+
Args:
|
20 |
+
x (tensor): input tokens with [B, H, W, C].
|
21 |
+
window_size (int): window size.
|
22 |
+
|
23 |
+
Returns:
|
24 |
+
windows: windows after partition with [B * num_windows, window_size, window_size, C].
|
25 |
+
(Hp, Wp): padded height and width before partition
|
26 |
+
"""
|
27 |
+
B, H, W, C = x.shape
|
28 |
+
|
29 |
+
pad_h = (window_size - H % window_size) % window_size
|
30 |
+
pad_w = (window_size - W % window_size) % window_size
|
31 |
+
if pad_h > 0 or pad_w > 0:
|
32 |
+
x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
|
33 |
+
Hp, Wp = H + pad_h, W + pad_w
|
34 |
+
|
35 |
+
x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
|
36 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
37 |
+
return windows, (Hp, Wp)
|
38 |
+
|
39 |
+
|
40 |
+
def window_unpartition(windows, window_size, pad_hw, hw):
|
41 |
+
"""
|
42 |
+
Window unpartition into original sequences and remove padding.
|
43 |
+
Args:
|
44 |
+
windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
|
45 |
+
window_size (int): window size.
|
46 |
+
pad_hw (Tuple): padded height and width (Hp, Wp).
|
47 |
+
hw (Tuple): original height and width (H, W) before padding.
|
48 |
+
|
49 |
+
Returns:
|
50 |
+
x: unpartitioned sequences with [B, H, W, C].
|
51 |
+
"""
|
52 |
+
Hp, Wp = pad_hw
|
53 |
+
H, W = hw
|
54 |
+
B = windows.shape[0] // (Hp * Wp // window_size // window_size)
|
55 |
+
x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
|
56 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
|
57 |
+
|
58 |
+
if Hp > H or Wp > W:
|
59 |
+
x = x[:, :H, :W, :].contiguous()
|
60 |
+
return x
|
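As a quick sanity sketch (not from the diff), window_partition and window_unpartition are inverses of each other once the padding is cropped back off:

import torch
from annotator.oneformer.detectron2.modeling.backbone.utils import window_partition, window_unpartition

x = torch.randn(2, 13, 17, 32)                             # H=13, W=17 are not multiples of 7
windows, (Hp, Wp) = window_partition(x, window_size=7)     # zero-pads to Hp=14, Wp=21, tiles 7x7 windows
assert windows.shape == (2 * (14 // 7) * (21 // 7), 7, 7, 32)
y = window_unpartition(windows, 7, (Hp, Wp), (13, 17))     # reassembles and crops the padding
assert torch.equal(x, y)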
61 |
+
|
62 |
+
|
63 |
+
def get_rel_pos(q_size, k_size, rel_pos):
|
64 |
+
"""
|
65 |
+
Get relative positional embeddings according to the relative positions of
|
66 |
+
query and key sizes.
|
67 |
+
Args:
|
68 |
+
q_size (int): size of query q.
|
69 |
+
k_size (int): size of key k.
|
70 |
+
rel_pos (Tensor): relative position embeddings (L, C).
|
71 |
+
|
72 |
+
Returns:
|
73 |
+
Extracted positional embeddings according to relative positions.
|
74 |
+
"""
|
75 |
+
max_rel_dist = int(2 * max(q_size, k_size) - 1)
|
76 |
+
# Interpolate rel pos if needed.
|
77 |
+
if rel_pos.shape[0] != max_rel_dist:
|
78 |
+
# Interpolate rel pos.
|
79 |
+
rel_pos_resized = F.interpolate(
|
80 |
+
rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
|
81 |
+
size=max_rel_dist,
|
82 |
+
mode="linear",
|
83 |
+
)
|
84 |
+
rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
|
85 |
+
else:
|
86 |
+
rel_pos_resized = rel_pos
|
87 |
+
|
88 |
+
# Scale the coords with short length if shapes for q and k are different.
|
89 |
+
q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
|
90 |
+
k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
|
91 |
+
relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
|
92 |
+
|
93 |
+
return rel_pos_resized[relative_coords.long()]
|
94 |
+
|
95 |
+
|
96 |
+
def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
|
97 |
+
"""
|
98 |
+
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
|
99 |
+
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
|
100 |
+
Args:
|
101 |
+
attn (Tensor): attention map.
|
102 |
+
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
|
103 |
+
rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
|
104 |
+
rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
|
105 |
+
q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
|
106 |
+
k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
|
107 |
+
|
108 |
+
Returns:
|
109 |
+
attn (Tensor): attention map with added relative positional embeddings.
|
110 |
+
"""
|
111 |
+
q_h, q_w = q_size
|
112 |
+
k_h, k_w = k_size
|
113 |
+
Rh = get_rel_pos(q_h, k_h, rel_pos_h)
|
114 |
+
Rw = get_rel_pos(q_w, k_w, rel_pos_w)
|
115 |
+
|
116 |
+
B, _, dim = q.shape
|
117 |
+
r_q = q.reshape(B, q_h, q_w, dim)
|
118 |
+
rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
|
119 |
+
rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
|
120 |
+
|
121 |
+
attn = (
|
122 |
+
attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
|
123 |
+
).view(B, q_h * q_w, k_h * k_w)
|
124 |
+
|
125 |
+
return attn
|
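A small shape check (illustrative only) for the helper above: with zero-initialized tables, which is what rel_pos_zero_init=True in vit.py produces, the attention map passes through unchanged and only the shape contract matters.

import torch
from annotator.oneformer.detectron2.modeling.backbone.utils import add_decomposed_rel_pos

bh, dim, q_h, q_w = 4, 16, 7, 7                       # bh = B * num_heads flattened, as in vit.py
q = torch.randn(bh, q_h * q_w, dim)
attn = torch.randn(bh, q_h * q_w, q_h * q_w)
rel_pos_h = torch.zeros(2 * q_h - 1, dim)             # (2 * size - 1, head_dim) tables
rel_pos_w = torch.zeros(2 * q_w - 1, dim)
out = add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, (q_h, q_w), (q_h, q_w))
assert out.shape == attn.shape and torch.allclose(out, attn)   # zero tables add nothing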
126 |
+
|
127 |
+
|
128 |
+
def get_abs_pos(abs_pos, has_cls_token, hw):
|
129 |
+
"""
|
130 |
+
Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
|
131 |
+
dimension for the original embeddings.
|
132 |
+
Args:
|
133 |
+
abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
|
134 |
+
has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
|
135 |
+
hw (Tuple): size of input image tokens.
|
136 |
+
|
137 |
+
Returns:
|
138 |
+
Absolute positional embeddings after processing with shape (1, H, W, C)
|
139 |
+
"""
|
140 |
+
h, w = hw
|
141 |
+
if has_cls_token:
|
142 |
+
abs_pos = abs_pos[:, 1:]
|
143 |
+
xy_num = abs_pos.shape[1]
|
144 |
+
size = int(math.sqrt(xy_num))
|
145 |
+
assert size * size == xy_num
|
146 |
+
|
147 |
+
if size != h or size != w:
|
148 |
+
new_abs_pos = F.interpolate(
|
149 |
+
abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
|
150 |
+
size=(h, w),
|
151 |
+
mode="bicubic",
|
152 |
+
align_corners=False,
|
153 |
+
)
|
154 |
+
|
155 |
+
return new_abs_pos.permute(0, 2, 3, 1)
|
156 |
+
else:
|
157 |
+
return abs_pos.reshape(1, h, w, -1)
|
158 |
+
|
159 |
+
|
160 |
+
class PatchEmbed(nn.Module):
|
161 |
+
"""
|
162 |
+
Image to Patch Embedding.
|
163 |
+
"""
|
164 |
+
|
165 |
+
def __init__(
|
166 |
+
self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
|
167 |
+
):
|
168 |
+
"""
|
169 |
+
Args:
|
170 |
+
kernel_size (Tuple): kernel size of the projection layer.
|
171 |
+
stride (Tuple): stride of the projection layer.
|
172 |
+
padding (Tuple): padding size of the projection layer.
|
173 |
+
in_chans (int): Number of input image channels.
|
174 |
+
embed_dim (int): Patch embedding dimension.
|
175 |
+
"""
|
176 |
+
super().__init__()
|
177 |
+
|
178 |
+
self.proj = nn.Conv2d(
|
179 |
+
in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
|
180 |
+
)
|
181 |
+
|
182 |
+
def forward(self, x):
|
183 |
+
x = self.proj(x)
|
184 |
+
# B C H W -> B H W C
|
185 |
+
x = x.permute(0, 2, 3, 1)
|
186 |
+
return x
|
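To close out this file, a short sketch (hypothetical sizes) of how PatchEmbed and get_abs_pos combine, mirroring what ViT.forward in vit.py below does:

import torch
from annotator.oneformer.detectron2.modeling.backbone.utils import PatchEmbed, get_abs_pos

embed = PatchEmbed(kernel_size=(16, 16), stride=(16, 16), in_chans=3, embed_dim=768)
x = embed(torch.randn(1, 3, 512, 512))                # -> (1, 32, 32, 768), channels-last tokens
pos = torch.zeros(1, 14 * 14 + 1, 768)                # pretrain table: 14x14 patches plus a cls token
x = x + get_abs_pos(pos, has_cls_token=True, hw=(x.shape[1], x.shape[2]))   # resized to 32x32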
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/vit.py
ADDED
@@ -0,0 +1,524 @@
1 |
+
import logging
|
2 |
+
import math
|
3 |
+
import fvcore.nn.weight_init as weight_init
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
|
7 |
+
from annotator.oneformer.detectron2.layers import CNNBlockBase, Conv2d, get_norm
|
8 |
+
from annotator.oneformer.detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous
|
9 |
+
|
10 |
+
from .backbone import Backbone
|
11 |
+
from .utils import (
|
12 |
+
PatchEmbed,
|
13 |
+
add_decomposed_rel_pos,
|
14 |
+
get_abs_pos,
|
15 |
+
window_partition,
|
16 |
+
window_unpartition,
|
17 |
+
)
|
18 |
+
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
__all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]
|
23 |
+
|
24 |
+
|
25 |
+
class Attention(nn.Module):
|
26 |
+
"""Multi-head Attention block with relative position embeddings."""
|
27 |
+
|
28 |
+
def __init__(
|
29 |
+
self,
|
30 |
+
dim,
|
31 |
+
num_heads=8,
|
32 |
+
qkv_bias=True,
|
33 |
+
use_rel_pos=False,
|
34 |
+
rel_pos_zero_init=True,
|
35 |
+
input_size=None,
|
36 |
+
):
|
37 |
+
"""
|
38 |
+
Args:
|
39 |
+
dim (int): Number of input channels.
|
40 |
+
num_heads (int): Number of attention heads.
|
41 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value.
|
42 |
+
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
|
43 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
44 |
+
input_size (int or None): Input resolution for calculating the relative positional
|
45 |
+
parameter size.
|
46 |
+
"""
|
47 |
+
super().__init__()
|
48 |
+
self.num_heads = num_heads
|
49 |
+
head_dim = dim // num_heads
|
50 |
+
self.scale = head_dim**-0.5
|
51 |
+
|
52 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
53 |
+
self.proj = nn.Linear(dim, dim)
|
54 |
+
|
55 |
+
self.use_rel_pos = use_rel_pos
|
56 |
+
if self.use_rel_pos:
|
57 |
+
# initialize relative positional embeddings
|
58 |
+
self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
|
59 |
+
self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
|
60 |
+
|
61 |
+
if not rel_pos_zero_init:
|
62 |
+
nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
|
63 |
+
nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
|
64 |
+
|
65 |
+
def forward(self, x):
|
66 |
+
B, H, W, _ = x.shape
|
67 |
+
# qkv with shape (3, B, nHead, H * W, C)
|
68 |
+
qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
69 |
+
# q, k, v with shape (B * nHead, H * W, C)
|
70 |
+
q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
|
71 |
+
|
72 |
+
attn = (q * self.scale) @ k.transpose(-2, -1)
|
73 |
+
|
74 |
+
if self.use_rel_pos:
|
75 |
+
attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
|
76 |
+
|
77 |
+
attn = attn.softmax(dim=-1)
|
78 |
+
x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
|
79 |
+
x = self.proj(x)
|
80 |
+
|
81 |
+
return x
|
82 |
+
|
83 |
+
|
84 |
+
class ResBottleneckBlock(CNNBlockBase):
|
85 |
+
"""
|
86 |
+
The standard bottleneck residual block without the last activation layer.
|
87 |
+
It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
|
88 |
+
"""
|
89 |
+
|
90 |
+
def __init__(
|
91 |
+
self,
|
92 |
+
in_channels,
|
93 |
+
out_channels,
|
94 |
+
bottleneck_channels,
|
95 |
+
norm="LN",
|
96 |
+
act_layer=nn.GELU,
|
97 |
+
):
|
98 |
+
"""
|
99 |
+
Args:
|
100 |
+
in_channels (int): Number of input channels.
|
101 |
+
out_channels (int): Number of output channels.
|
102 |
+
bottleneck_channels (int): number of output channels for the 3x3
|
103 |
+
"bottleneck" conv layers.
|
104 |
+
norm (str or callable): normalization for all conv layers.
|
105 |
+
See :func:`layers.get_norm` for supported format.
|
106 |
+
act_layer (callable): activation for all conv layers.
|
107 |
+
"""
|
108 |
+
super().__init__(in_channels, out_channels, 1)
|
109 |
+
|
110 |
+
self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
|
111 |
+
self.norm1 = get_norm(norm, bottleneck_channels)
|
112 |
+
self.act1 = act_layer()
|
113 |
+
|
114 |
+
self.conv2 = Conv2d(
|
115 |
+
bottleneck_channels,
|
116 |
+
bottleneck_channels,
|
117 |
+
3,
|
118 |
+
padding=1,
|
119 |
+
bias=False,
|
120 |
+
)
|
121 |
+
self.norm2 = get_norm(norm, bottleneck_channels)
|
122 |
+
self.act2 = act_layer()
|
123 |
+
|
124 |
+
self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
|
125 |
+
self.norm3 = get_norm(norm, out_channels)
|
126 |
+
|
127 |
+
for layer in [self.conv1, self.conv2, self.conv3]:
|
128 |
+
weight_init.c2_msra_fill(layer)
|
129 |
+
for layer in [self.norm1, self.norm2]:
|
130 |
+
layer.weight.data.fill_(1.0)
|
131 |
+
layer.bias.data.zero_()
|
132 |
+
# zero init last norm layer.
|
133 |
+
self.norm3.weight.data.zero_()
|
134 |
+
self.norm3.bias.data.zero_()
|
135 |
+
|
136 |
+
def forward(self, x):
|
137 |
+
out = x
|
138 |
+
for layer in self.children():
|
139 |
+
out = layer(out)
|
140 |
+
|
141 |
+
out = x + out
|
142 |
+
return out
|
143 |
+
|
144 |
+
|
145 |
+
class Block(nn.Module):
|
146 |
+
"""Transformer blocks with support of window attention and residual propagation blocks"""
|
147 |
+
|
148 |
+
def __init__(
|
149 |
+
self,
|
150 |
+
dim,
|
151 |
+
num_heads,
|
152 |
+
mlp_ratio=4.0,
|
153 |
+
qkv_bias=True,
|
154 |
+
drop_path=0.0,
|
155 |
+
norm_layer=nn.LayerNorm,
|
156 |
+
act_layer=nn.GELU,
|
157 |
+
use_rel_pos=False,
|
158 |
+
rel_pos_zero_init=True,
|
159 |
+
window_size=0,
|
160 |
+
use_residual_block=False,
|
161 |
+
input_size=None,
|
162 |
+
):
|
163 |
+
"""
|
164 |
+
Args:
|
165 |
+
dim (int): Number of input channels.
|
166 |
+
num_heads (int): Number of attention heads in each ViT block.
|
167 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
168 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value.
|
169 |
+
drop_path (float): Stochastic depth rate.
|
170 |
+
norm_layer (nn.Module): Normalization layer.
|
171 |
+
act_layer (nn.Module): Activation layer.
|
172 |
+
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
|
173 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
174 |
+
window_size (int): Window size for window attention blocks. If it equals 0,
window attention is not used.
|
176 |
+
use_residual_block (bool): If True, use a residual block after the MLP block.
|
177 |
+
input_size (int or None): Input resolution for calculating the relative positional
|
178 |
+
parameter size.
|
179 |
+
"""
|
180 |
+
super().__init__()
|
181 |
+
self.norm1 = norm_layer(dim)
|
182 |
+
self.attn = Attention(
|
183 |
+
dim,
|
184 |
+
num_heads=num_heads,
|
185 |
+
qkv_bias=qkv_bias,
|
186 |
+
use_rel_pos=use_rel_pos,
|
187 |
+
rel_pos_zero_init=rel_pos_zero_init,
|
188 |
+
input_size=input_size if window_size == 0 else (window_size, window_size),
|
189 |
+
)
|
190 |
+
|
191 |
+
from timm.models.layers import DropPath, Mlp
|
192 |
+
|
193 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
194 |
+
self.norm2 = norm_layer(dim)
|
195 |
+
self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer)
|
196 |
+
|
197 |
+
self.window_size = window_size
|
198 |
+
|
199 |
+
self.use_residual_block = use_residual_block
|
200 |
+
if use_residual_block:
|
201 |
+
# Use a residual block with bottleneck channel as dim // 2
|
202 |
+
self.residual = ResBottleneckBlock(
|
203 |
+
in_channels=dim,
|
204 |
+
out_channels=dim,
|
205 |
+
bottleneck_channels=dim // 2,
|
206 |
+
norm="LN",
|
207 |
+
act_layer=act_layer,
|
208 |
+
)
|
209 |
+
|
210 |
+
def forward(self, x):
|
211 |
+
shortcut = x
|
212 |
+
x = self.norm1(x)
|
213 |
+
# Window partition
|
214 |
+
if self.window_size > 0:
|
215 |
+
H, W = x.shape[1], x.shape[2]
|
216 |
+
x, pad_hw = window_partition(x, self.window_size)
|
217 |
+
|
218 |
+
x = self.attn(x)
|
219 |
+
# Reverse window partition
|
220 |
+
if self.window_size > 0:
|
221 |
+
x = window_unpartition(x, self.window_size, pad_hw, (H, W))
|
222 |
+
|
223 |
+
x = shortcut + self.drop_path(x)
|
224 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
225 |
+
|
226 |
+
if self.use_residual_block:
|
227 |
+
x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
|
228 |
+
|
229 |
+
return x
|
230 |
+
|
231 |
+
|
232 |
+
class ViT(Backbone):
|
233 |
+
"""
|
234 |
+
This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
|
235 |
+
"Exploring Plain Vision Transformer Backbones for Object Detection",
|
236 |
+
https://arxiv.org/abs/2203.16527
|
237 |
+
"""
|
238 |
+
|
239 |
+
def __init__(
|
240 |
+
self,
|
241 |
+
img_size=1024,
|
242 |
+
patch_size=16,
|
243 |
+
in_chans=3,
|
244 |
+
embed_dim=768,
|
245 |
+
depth=12,
|
246 |
+
num_heads=12,
|
247 |
+
mlp_ratio=4.0,
|
248 |
+
qkv_bias=True,
|
249 |
+
drop_path_rate=0.0,
|
250 |
+
norm_layer=nn.LayerNorm,
|
251 |
+
act_layer=nn.GELU,
|
252 |
+
use_abs_pos=True,
|
253 |
+
use_rel_pos=False,
|
254 |
+
rel_pos_zero_init=True,
|
255 |
+
window_size=0,
|
256 |
+
window_block_indexes=(),
|
257 |
+
residual_block_indexes=(),
|
258 |
+
use_act_checkpoint=False,
|
259 |
+
pretrain_img_size=224,
|
260 |
+
pretrain_use_cls_token=True,
|
261 |
+
out_feature="last_feat",
|
262 |
+
):
|
263 |
+
"""
|
264 |
+
Args:
|
265 |
+
img_size (int): Input image size.
|
266 |
+
patch_size (int): Patch size.
|
267 |
+
in_chans (int): Number of input image channels.
|
268 |
+
embed_dim (int): Patch embedding dimension.
|
269 |
+
depth (int): Depth of ViT.
|
270 |
+
num_heads (int): Number of attention heads in each ViT block.
|
271 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
272 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value.
|
273 |
+
drop_path_rate (float): Stochastic depth rate.
|
274 |
+
norm_layer (nn.Module): Normalization layer.
|
275 |
+
act_layer (nn.Module): Activation layer.
|
276 |
+
use_abs_pos (bool): If True, use absolute positional embeddings.
|
277 |
+
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
|
278 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
279 |
+
window_size (int): Window size for window attention blocks.
|
280 |
+
window_block_indexes (list): Indexes for blocks using window attention.
|
281 |
+
residual_block_indexes (list): Indexes for blocks using conv propagation.
|
282 |
+
use_act_checkpoint (bool): If True, use activation checkpointing.
|
283 |
+
pretrain_img_size (int): input image size for pretraining models.
|
284 |
+
pretrain_use_cls_token (bool): If True, pretraining models use class token.
|
285 |
+
out_feature (str): name of the feature from the last block.
|
286 |
+
"""
|
287 |
+
super().__init__()
|
288 |
+
self.pretrain_use_cls_token = pretrain_use_cls_token
|
289 |
+
|
290 |
+
self.patch_embed = PatchEmbed(
|
291 |
+
kernel_size=(patch_size, patch_size),
|
292 |
+
stride=(patch_size, patch_size),
|
293 |
+
in_chans=in_chans,
|
294 |
+
embed_dim=embed_dim,
|
295 |
+
)
|
296 |
+
|
297 |
+
if use_abs_pos:
|
298 |
+
# Initialize absolute positional embedding with pretrain image size.
|
299 |
+
num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
|
300 |
+
num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
|
301 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
|
302 |
+
else:
|
303 |
+
self.pos_embed = None
|
304 |
+
|
305 |
+
# stochastic depth decay rule
|
306 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
|
307 |
+
|
308 |
+
self.blocks = nn.ModuleList()
|
309 |
+
for i in range(depth):
|
310 |
+
block = Block(
|
311 |
+
dim=embed_dim,
|
312 |
+
num_heads=num_heads,
|
313 |
+
mlp_ratio=mlp_ratio,
|
314 |
+
qkv_bias=qkv_bias,
|
315 |
+
drop_path=dpr[i],
|
316 |
+
norm_layer=norm_layer,
|
317 |
+
act_layer=act_layer,
|
318 |
+
use_rel_pos=use_rel_pos,
|
319 |
+
rel_pos_zero_init=rel_pos_zero_init,
|
320 |
+
window_size=window_size if i in window_block_indexes else 0,
|
321 |
+
use_residual_block=i in residual_block_indexes,
|
322 |
+
input_size=(img_size // patch_size, img_size // patch_size),
|
323 |
+
)
|
324 |
+
if use_act_checkpoint:
|
325 |
+
# TODO: use torch.utils.checkpoint
|
326 |
+
from fairscale.nn.checkpoint import checkpoint_wrapper
|
327 |
+
|
328 |
+
block = checkpoint_wrapper(block)
|
329 |
+
self.blocks.append(block)
|
330 |
+
|
331 |
+
self._out_feature_channels = {out_feature: embed_dim}
|
332 |
+
self._out_feature_strides = {out_feature: patch_size}
|
333 |
+
self._out_features = [out_feature]
|
334 |
+
|
335 |
+
if self.pos_embed is not None:
|
336 |
+
nn.init.trunc_normal_(self.pos_embed, std=0.02)
|
337 |
+
|
338 |
+
self.apply(self._init_weights)
|
339 |
+
|
340 |
+
def _init_weights(self, m):
|
341 |
+
if isinstance(m, nn.Linear):
|
342 |
+
nn.init.trunc_normal_(m.weight, std=0.02)
|
343 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
344 |
+
nn.init.constant_(m.bias, 0)
|
345 |
+
elif isinstance(m, nn.LayerNorm):
|
346 |
+
nn.init.constant_(m.bias, 0)
|
347 |
+
nn.init.constant_(m.weight, 1.0)
|
348 |
+
|
349 |
+
def forward(self, x):
|
350 |
+
x = self.patch_embed(x)
|
351 |
+
if self.pos_embed is not None:
|
352 |
+
x = x + get_abs_pos(
|
353 |
+
self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
|
354 |
+
)
|
355 |
+
|
356 |
+
for blk in self.blocks:
|
357 |
+
x = blk(x)
|
358 |
+
|
359 |
+
outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)}
|
360 |
+
return outputs
|
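A minimal construction sketch for the class above; the hyperparameters are illustrative (roughly ViT-B/16 with windowed attention in most blocks), not a config shipped with this repo:

import torch
from annotator.oneformer.detectron2.modeling.backbone.vit import ViT

vit = ViT(
    img_size=256, patch_size=16, embed_dim=768, depth=12, num_heads=12,
    window_size=8, window_block_indexes=[0, 1, 3, 4, 6, 7, 9, 10],   # global attention in blocks 2, 5, 8, 11
    use_rel_pos=True, out_feature="last_feat",
)
with torch.no_grad():
    feats = vit(torch.randn(1, 3, 256, 256))
assert feats["last_feat"].shape == (1, 768, 16, 16)   # a single stride-16 feature map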
361 |
+
|
362 |
+
|
363 |
+
class SimpleFeaturePyramid(Backbone):
|
364 |
+
"""
|
365 |
+
This module implements SimpleFeaturePyramid in :paper:`vitdet`.
|
366 |
+
It creates pyramid features built on top of the input feature map.
|
367 |
+
"""
|
368 |
+
|
369 |
+
def __init__(
|
370 |
+
self,
|
371 |
+
net,
|
372 |
+
in_feature,
|
373 |
+
out_channels,
|
374 |
+
scale_factors,
|
375 |
+
top_block=None,
|
376 |
+
norm="LN",
|
377 |
+
square_pad=0,
|
378 |
+
):
|
379 |
+
"""
|
380 |
+
Args:
|
381 |
+
net (Backbone): module representing the subnetwork backbone.
|
382 |
+
Must be a subclass of :class:`Backbone`.
|
383 |
+
in_feature (str): name of the input feature map coming
|
384 |
+
from the net.
|
385 |
+
out_channels (int): number of channels in the output feature maps.
|
386 |
+
scale_factors (list[float]): list of scaling factors to upsample or downsample
|
387 |
+
the input features for creating pyramid features.
|
388 |
+
top_block (nn.Module or None): if provided, an extra operation will
|
389 |
+
be performed on the output of the last (smallest resolution)
|
390 |
+
pyramid output, and the result will extend the result list. The top_block
|
391 |
+
further downsamples the feature map. It must have an attribute
|
392 |
+
"num_levels", meaning the number of extra pyramid levels added by
|
393 |
+
this block, and "in_feature", which is a string representing
|
394 |
+
its input feature (e.g., p5).
|
395 |
+
norm (str): the normalization to use.
|
396 |
+
square_pad (int): If > 0, require input images to be padded to specific square size.
|
397 |
+
"""
|
398 |
+
super(SimpleFeaturePyramid, self).__init__()
|
399 |
+
assert isinstance(net, Backbone)
|
400 |
+
|
401 |
+
self.scale_factors = scale_factors
|
402 |
+
|
403 |
+
input_shapes = net.output_shape()
|
404 |
+
strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors]
|
405 |
+
_assert_strides_are_log2_contiguous(strides)
|
406 |
+
|
407 |
+
dim = input_shapes[in_feature].channels
|
408 |
+
self.stages = []
|
409 |
+
use_bias = norm == ""
|
410 |
+
for idx, scale in enumerate(scale_factors):
|
411 |
+
out_dim = dim
|
412 |
+
if scale == 4.0:
|
413 |
+
layers = [
|
414 |
+
nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
|
415 |
+
get_norm(norm, dim // 2),
|
416 |
+
nn.GELU(),
|
417 |
+
nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
|
418 |
+
]
|
419 |
+
out_dim = dim // 4
|
420 |
+
elif scale == 2.0:
|
421 |
+
layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)]
|
422 |
+
out_dim = dim // 2
|
423 |
+
elif scale == 1.0:
|
424 |
+
layers = []
|
425 |
+
elif scale == 0.5:
|
426 |
+
layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
|
427 |
+
else:
|
428 |
+
raise NotImplementedError(f"scale_factor={scale} is not supported yet.")
|
429 |
+
|
430 |
+
layers.extend(
|
431 |
+
[
|
432 |
+
Conv2d(
|
433 |
+
out_dim,
|
434 |
+
out_channels,
|
435 |
+
kernel_size=1,
|
436 |
+
bias=use_bias,
|
437 |
+
norm=get_norm(norm, out_channels),
|
438 |
+
),
|
439 |
+
Conv2d(
|
440 |
+
out_channels,
|
441 |
+
out_channels,
|
442 |
+
kernel_size=3,
|
443 |
+
padding=1,
|
444 |
+
bias=use_bias,
|
445 |
+
norm=get_norm(norm, out_channels),
|
446 |
+
),
|
447 |
+
]
|
448 |
+
)
|
449 |
+
layers = nn.Sequential(*layers)
|
450 |
+
|
451 |
+
stage = int(math.log2(strides[idx]))
|
452 |
+
self.add_module(f"simfp_{stage}", layers)
|
453 |
+
self.stages.append(layers)
|
454 |
+
|
455 |
+
self.net = net
|
456 |
+
self.in_feature = in_feature
|
457 |
+
self.top_block = top_block
|
458 |
+
# Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
|
459 |
+
self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
|
460 |
+
# top block output feature maps.
|
461 |
+
if self.top_block is not None:
|
462 |
+
for s in range(stage, stage + self.top_block.num_levels):
|
463 |
+
self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
|
464 |
+
|
465 |
+
self._out_features = list(self._out_feature_strides.keys())
|
466 |
+
self._out_feature_channels = {k: out_channels for k in self._out_features}
|
467 |
+
self._size_divisibility = strides[-1]
|
468 |
+
self._square_pad = square_pad
|
469 |
+
|
470 |
+
@property
|
471 |
+
def padding_constraints(self):
|
472 |
+
return {
|
473 |
+
"size_divisiblity": self._size_divisibility,
|
474 |
+
"square_size": self._square_pad,
|
475 |
+
}
|
476 |
+
|
477 |
+
def forward(self, x):
|
478 |
+
"""
|
479 |
+
Args:
|
480 |
+
x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
|
481 |
+
|
482 |
+
Returns:
|
483 |
+
dict[str->Tensor]:
|
484 |
+
mapping from feature map name to pyramid feature map tensor
|
485 |
+
in high to low resolution order. Returned feature names follow the FPN
|
486 |
+
convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
|
487 |
+
["p2", "p3", ..., "p6"].
|
488 |
+
"""
|
489 |
+
bottom_up_features = self.net(x)
|
490 |
+
features = bottom_up_features[self.in_feature]
|
491 |
+
results = []
|
492 |
+
|
493 |
+
for stage in self.stages:
|
494 |
+
results.append(stage(features))
|
495 |
+
|
496 |
+
if self.top_block is not None:
|
497 |
+
if self.top_block.in_feature in bottom_up_features:
|
498 |
+
top_block_in_feature = bottom_up_features[self.top_block.in_feature]
|
499 |
+
else:
|
500 |
+
top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
|
501 |
+
results.extend(self.top_block(top_block_in_feature))
|
502 |
+
assert len(self._out_features) == len(results)
|
503 |
+
return {f: res for f, res in zip(self._out_features, results)}
|
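A hedged end-to-end sketch of the ViTDet pattern this class implements: wrap the single-scale ViT output into a multi-scale pyramid. Using LastLevelMaxPool from fpn.py as the top block is an assumption borrowed from the reference configs, not something this diff mandates.

import torch
from annotator.oneformer.detectron2.modeling.backbone.fpn import LastLevelMaxPool
from annotator.oneformer.detectron2.modeling.backbone.vit import ViT, SimpleFeaturePyramid

backbone = SimpleFeaturePyramid(
    net=ViT(img_size=256, patch_size=16, embed_dim=768, depth=12, num_heads=12),
    in_feature="last_feat",
    out_channels=256,
    scale_factors=(4.0, 2.0, 1.0, 0.5),   # strides 4, 8, 16, 32 derived from the stride-16 ViT map
    top_block=LastLevelMaxPool(),
    norm="LN",
)
with torch.no_grad():
    outs = backbone(torch.randn(1, 3, 256, 256))
# keys "p2".."p6"; e.g. outs["p2"].shape == (1, 256, 64, 64) and outs["p6"].shape == (1, 256, 4, 4)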
504 |
+
|
505 |
+
|
506 |
+
def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
|
507 |
+
"""
|
508 |
+
Calculate lr decay rate for different ViT blocks.
|
509 |
+
Args:
|
510 |
+
name (string): parameter name.
|
511 |
+
lr_decay_rate (float): base lr decay rate.
|
512 |
+
num_layers (int): number of ViT blocks.
|
513 |
+
|
514 |
+
Returns:
|
515 |
+
lr decay rate for the given parameter.
|
516 |
+
"""
|
517 |
+
layer_id = num_layers + 1
|
518 |
+
if name.startswith("backbone"):
|
519 |
+
if ".pos_embed" in name or ".patch_embed" in name:
|
520 |
+
layer_id = 0
|
521 |
+
elif ".blocks." in name and ".residual." not in name:
|
522 |
+
layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
|
523 |
+
|
524 |
+
return lr_decay_rate ** (num_layers + 1 - layer_id)
|
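The function above is typically used to compute a per-parameter learning-rate multiplier; the parameter names below are hypothetical examples, not taken from this diff:

from annotator.oneformer.detectron2.modeling.backbone.vit import get_vit_lr_decay_rate

# layer_id 0 for pos_embed/patch_embed, block i gets layer_id i + 1, everything else num_layers + 1
get_vit_lr_decay_rate("backbone.net.patch_embed.proj.weight", lr_decay_rate=0.7, num_layers=12)   # 0.7 ** 13
get_vit_lr_decay_rate("backbone.net.blocks.0.attn.qkv.weight", lr_decay_rate=0.7, num_layers=12)  # 0.7 ** 12
get_vit_lr_decay_rate("backbone.net.blocks.11.mlp.fc1.weight", lr_decay_rate=0.7, num_layers=12)  # 0.7 ** 1
get_vit_lr_decay_rate("roi_heads.box_head.fc1.weight", lr_decay_rate=0.7, num_layers=12)          # 1.0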
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/box_regression.py
ADDED
@@ -0,0 +1,369 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
import math
|
3 |
+
from typing import List, Tuple, Union
|
4 |
+
import torch
|
5 |
+
from fvcore.nn import giou_loss, smooth_l1_loss
|
6 |
+
from torch.nn import functional as F
|
7 |
+
|
8 |
+
from annotator.oneformer.detectron2.layers import cat, ciou_loss, diou_loss
|
9 |
+
from annotator.oneformer.detectron2.structures import Boxes
|
10 |
+
|
11 |
+
# Value for clamping large dw and dh predictions. The heuristic is that we clamp
|
12 |
+
# such that dw and dh are no larger than what would transform a 16px box into a
|
13 |
+
# 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
|
14 |
+
_DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
|
15 |
+
|
16 |
+
|
17 |
+
__all__ = ["Box2BoxTransform", "Box2BoxTransformRotated", "Box2BoxTransformLinear"]
|
18 |
+
|
19 |
+
|
20 |
+
@torch.jit.script
|
21 |
+
class Box2BoxTransform(object):
|
22 |
+
"""
|
23 |
+
The box-to-box transform defined in R-CNN. The transformation is parameterized
|
24 |
+
by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height
|
25 |
+
by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height).
|
26 |
+
"""
|
27 |
+
|
28 |
+
def __init__(
|
29 |
+
self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP
|
30 |
+
):
|
31 |
+
"""
|
32 |
+
Args:
|
33 |
+
weights (4-element tuple): Scaling factors that are applied to the
|
34 |
+
(dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set
|
35 |
+
such that the deltas have unit variance; now they are treated as
|
36 |
+
hyperparameters of the system.
|
37 |
+
scale_clamp (float): When predicting deltas, the predicted box scaling
|
38 |
+
factors (dw and dh) are clamped such that they are <= scale_clamp.
|
39 |
+
"""
|
40 |
+
self.weights = weights
|
41 |
+
self.scale_clamp = scale_clamp
|
42 |
+
|
43 |
+
def get_deltas(self, src_boxes, target_boxes):
|
44 |
+
"""
|
45 |
+
Get box regression transformation deltas (dx, dy, dw, dh) that can be used
|
46 |
+
to transform the `src_boxes` into the `target_boxes`. That is, the relation
|
47 |
+
``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
|
48 |
+
any delta is too large and is clamped).
|
49 |
+
|
50 |
+
Args:
|
51 |
+
src_boxes (Tensor): source boxes, e.g., object proposals
|
52 |
+
target_boxes (Tensor): target of the transformation, e.g., ground-truth
|
53 |
+
boxes.
|
54 |
+
"""
|
55 |
+
assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
|
56 |
+
assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
|
57 |
+
|
58 |
+
src_widths = src_boxes[:, 2] - src_boxes[:, 0]
|
59 |
+
src_heights = src_boxes[:, 3] - src_boxes[:, 1]
|
60 |
+
src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
|
61 |
+
src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
|
62 |
+
|
63 |
+
target_widths = target_boxes[:, 2] - target_boxes[:, 0]
|
64 |
+
target_heights = target_boxes[:, 3] - target_boxes[:, 1]
|
65 |
+
target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths
|
66 |
+
target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights
|
67 |
+
|
68 |
+
wx, wy, ww, wh = self.weights
|
69 |
+
dx = wx * (target_ctr_x - src_ctr_x) / src_widths
|
70 |
+
dy = wy * (target_ctr_y - src_ctr_y) / src_heights
|
71 |
+
dw = ww * torch.log(target_widths / src_widths)
|
72 |
+
dh = wh * torch.log(target_heights / src_heights)
|
73 |
+
|
74 |
+
deltas = torch.stack((dx, dy, dw, dh), dim=1)
|
75 |
+
assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
|
76 |
+
return deltas
|
77 |
+
|
78 |
+
def apply_deltas(self, deltas, boxes):
|
79 |
+
"""
|
80 |
+
Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
|
81 |
+
|
82 |
+
Args:
|
83 |
+
deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
|
84 |
+
deltas[i] represents k potentially different class-specific
|
85 |
+
box transformations for the single box boxes[i].
|
86 |
+
boxes (Tensor): boxes to transform, of shape (N, 4)
|
87 |
+
"""
|
88 |
+
deltas = deltas.float() # ensure fp32 for decoding precision
|
89 |
+
boxes = boxes.to(deltas.dtype)
|
90 |
+
|
91 |
+
widths = boxes[:, 2] - boxes[:, 0]
|
92 |
+
heights = boxes[:, 3] - boxes[:, 1]
|
93 |
+
ctr_x = boxes[:, 0] + 0.5 * widths
|
94 |
+
ctr_y = boxes[:, 1] + 0.5 * heights
|
95 |
+
|
96 |
+
wx, wy, ww, wh = self.weights
|
97 |
+
dx = deltas[:, 0::4] / wx
|
98 |
+
dy = deltas[:, 1::4] / wy
|
99 |
+
dw = deltas[:, 2::4] / ww
|
100 |
+
dh = deltas[:, 3::4] / wh
|
101 |
+
|
102 |
+
# Prevent sending too large values into torch.exp()
|
103 |
+
dw = torch.clamp(dw, max=self.scale_clamp)
|
104 |
+
dh = torch.clamp(dh, max=self.scale_clamp)
|
105 |
+
|
106 |
+
pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
|
107 |
+
pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
|
108 |
+
pred_w = torch.exp(dw) * widths[:, None]
|
109 |
+
pred_h = torch.exp(dh) * heights[:, None]
|
110 |
+
|
111 |
+
x1 = pred_ctr_x - 0.5 * pred_w
|
112 |
+
y1 = pred_ctr_y - 0.5 * pred_h
|
113 |
+
x2 = pred_ctr_x + 0.5 * pred_w
|
114 |
+
y2 = pred_ctr_y + 0.5 * pred_h
|
115 |
+
pred_boxes = torch.stack((x1, y1, x2, y2), dim=-1)
|
116 |
+
return pred_boxes.reshape(deltas.shape)
|
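A small round-trip sanity check (illustrative values only) for get_deltas/apply_deltas above, using the customary Faster R-CNN weights:

import torch
from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransform

tfm = Box2BoxTransform(weights=(10.0, 10.0, 5.0, 5.0))
src = torch.tensor([[0.0, 0.0, 10.0, 10.0]])    # proposal, (x1, y1, x2, y2)
tgt = torch.tensor([[2.0, 3.0, 12.0, 17.0]])    # ground-truth box
deltas = tfm.get_deltas(src, tgt)               # weighted (dx, dy, dw, dh)
assert torch.allclose(tfm.apply_deltas(deltas, src), tgt, atol=1e-4)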
117 |
+
|
118 |
+
|
119 |
+
@torch.jit.script
|
120 |
+
class Box2BoxTransformRotated(object):
|
121 |
+
"""
|
122 |
+
The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized
|
123 |
+
by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height
|
124 |
+
by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height),
|
125 |
+
and rotates a box's angle by da (radians).
|
126 |
+
Note: angles of deltas are in radians while angles of boxes are in degrees.
|
127 |
+
"""
|
128 |
+
|
129 |
+
def __init__(
|
130 |
+
self,
|
131 |
+
weights: Tuple[float, float, float, float, float],
|
132 |
+
scale_clamp: float = _DEFAULT_SCALE_CLAMP,
|
133 |
+
):
|
134 |
+
"""
|
135 |
+
Args:
|
136 |
+
weights (5-element tuple): Scaling factors that are applied to the
|
137 |
+
(dx, dy, dw, dh, da) deltas. These are treated as
|
138 |
+
hyperparameters of the system.
|
139 |
+
scale_clamp (float): When predicting deltas, the predicted box scaling
|
140 |
+
factors (dw and dh) are clamped such that they are <= scale_clamp.
|
141 |
+
"""
|
142 |
+
self.weights = weights
|
143 |
+
self.scale_clamp = scale_clamp
|
144 |
+
|
145 |
+
def get_deltas(self, src_boxes, target_boxes):
|
146 |
+
"""
|
147 |
+
Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used
|
148 |
+
to transform the `src_boxes` into the `target_boxes`. That is, the relation
|
149 |
+
``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
|
150 |
+
any delta is too large and is clamped).
|
151 |
+
|
152 |
+
Args:
|
153 |
+
src_boxes (Tensor): Nx5 source boxes, e.g., object proposals
|
154 |
+
target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth
|
155 |
+
boxes.
|
156 |
+
"""
|
157 |
+
assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
|
158 |
+
assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
|
159 |
+
|
160 |
+
src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1)
|
161 |
+
|
162 |
+
target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind(
|
163 |
+
target_boxes, dim=1
|
164 |
+
)
|
165 |
+
|
166 |
+
wx, wy, ww, wh, wa = self.weights
|
167 |
+
dx = wx * (target_ctr_x - src_ctr_x) / src_widths
|
168 |
+
dy = wy * (target_ctr_y - src_ctr_y) / src_heights
|
169 |
+
dw = ww * torch.log(target_widths / src_widths)
|
170 |
+
dh = wh * torch.log(target_heights / src_heights)
|
171 |
+
# Angles of deltas are in radians while angles of boxes are in degrees.
|
172 |
+
# the conversion to radians serves as a way to normalize the values
|
173 |
+
da = target_angles - src_angles
|
174 |
+
da = (da + 180.0) % 360.0 - 180.0 # make it in [-180, 180)
|
175 |
+
da *= wa * math.pi / 180.0
|
176 |
+
|
177 |
+
deltas = torch.stack((dx, dy, dw, dh, da), dim=1)
|
178 |
+
assert (
|
179 |
+
(src_widths > 0).all().item()
|
180 |
+
), "Input boxes to Box2BoxTransformRotated are not valid!"
|
181 |
+
return deltas
|
182 |
+
|
183 |
+
def apply_deltas(self, deltas, boxes):
|
184 |
+
"""
|
185 |
+
Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`.
|
186 |
+
|
187 |
+
Args:
|
188 |
+
deltas (Tensor): transformation deltas of shape (N, k*5).
|
189 |
+
deltas[i] represents box transformation for the single box boxes[i].
|
190 |
+
boxes (Tensor): boxes to transform, of shape (N, 5)
|
191 |
+
"""
|
192 |
+
assert deltas.shape[1] % 5 == 0 and boxes.shape[1] == 5
|
193 |
+
|
194 |
+
boxes = boxes.to(deltas.dtype).unsqueeze(2)
|
195 |
+
|
196 |
+
ctr_x = boxes[:, 0]
|
197 |
+
ctr_y = boxes[:, 1]
|
198 |
+
widths = boxes[:, 2]
|
199 |
+
heights = boxes[:, 3]
|
200 |
+
angles = boxes[:, 4]
|
201 |
+
|
202 |
+
wx, wy, ww, wh, wa = self.weights
|
203 |
+
|
204 |
+
dx = deltas[:, 0::5] / wx
|
205 |
+
dy = deltas[:, 1::5] / wy
|
206 |
+
dw = deltas[:, 2::5] / ww
|
207 |
+
dh = deltas[:, 3::5] / wh
|
208 |
+
da = deltas[:, 4::5] / wa
|
209 |
+
|
210 |
+
# Prevent sending too large values into torch.exp()
|
211 |
+
dw = torch.clamp(dw, max=self.scale_clamp)
|
212 |
+
dh = torch.clamp(dh, max=self.scale_clamp)
|
213 |
+
|
214 |
+
pred_boxes = torch.zeros_like(deltas)
|
215 |
+
pred_boxes[:, 0::5] = dx * widths + ctr_x # x_ctr
|
216 |
+
pred_boxes[:, 1::5] = dy * heights + ctr_y # y_ctr
|
217 |
+
pred_boxes[:, 2::5] = torch.exp(dw) * widths # width
|
218 |
+
pred_boxes[:, 3::5] = torch.exp(dh) * heights # height
|
219 |
+
|
220 |
+
# Following original RRPN implementation,
|
221 |
+
# angles of deltas are in radians while angles of boxes are in degrees.
|
222 |
+
pred_angle = da * 180.0 / math.pi + angles
|
223 |
+
pred_angle = (pred_angle + 180.0) % 360.0 - 180.0 # make it in [-180, 180)
|
224 |
+
|
225 |
+
pred_boxes[:, 4::5] = pred_angle
|
226 |
+
|
227 |
+
return pred_boxes
|
228 |
+
|
229 |
+
|
230 |
+
class Box2BoxTransformLinear(object):
|
231 |
+
"""
|
232 |
+
The linear box-to-box transform defined in FCOS. The transformation is parameterized
|
233 |
+
by the distance from the center of (square) src box to 4 edges of the target box.
|
234 |
+
"""
|
235 |
+
|
236 |
+
def __init__(self, normalize_by_size=True):
|
237 |
+
"""
|
238 |
+
Args:
|
239 |
+
normalize_by_size: normalize deltas by the size of src (anchor) boxes.
|
240 |
+
"""
|
241 |
+
self.normalize_by_size = normalize_by_size
|
242 |
+
|
243 |
+
def get_deltas(self, src_boxes, target_boxes):
|
244 |
+
"""
|
245 |
+
Get box regression transformation deltas (dx1, dy1, dx2, dy2) that can be used
|
246 |
+
to transform the `src_boxes` into the `target_boxes`. That is, the relation
|
247 |
+
``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true.
|
248 |
+
The center of src must be inside target boxes.
|
249 |
+
|
250 |
+
Args:
|
251 |
+
src_boxes (Tensor): square source boxes, e.g., anchors
|
252 |
+
target_boxes (Tensor): target of the transformation, e.g., ground-truth
|
253 |
+
boxes.
|
254 |
+
"""
|
255 |
+
assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
|
256 |
+
assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
|
257 |
+
|
258 |
+
src_ctr_x = 0.5 * (src_boxes[:, 0] + src_boxes[:, 2])
|
259 |
+
src_ctr_y = 0.5 * (src_boxes[:, 1] + src_boxes[:, 3])
|
260 |
+
|
261 |
+
target_l = src_ctr_x - target_boxes[:, 0]
|
262 |
+
target_t = src_ctr_y - target_boxes[:, 1]
|
263 |
+
target_r = target_boxes[:, 2] - src_ctr_x
|
264 |
+
target_b = target_boxes[:, 3] - src_ctr_y
|
265 |
+
|
266 |
+
deltas = torch.stack((target_l, target_t, target_r, target_b), dim=1)
|
267 |
+
if self.normalize_by_size:
|
268 |
+
stride_w = src_boxes[:, 2] - src_boxes[:, 0]
|
269 |
+
stride_h = src_boxes[:, 3] - src_boxes[:, 1]
|
270 |
+
strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
|
271 |
+
deltas = deltas / strides
|
272 |
+
|
273 |
+
return deltas
|
274 |
+
|
275 |
+
def apply_deltas(self, deltas, boxes):
|
276 |
+
"""
|
277 |
+
Apply transformation `deltas` (dx1, dy1, dx2, dy2) to `boxes`.
|
278 |
+
|
279 |
+
Args:
|
280 |
+
deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
|
281 |
+
deltas[i] represents k potentially different class-specific
|
282 |
+
box transformations for the single box boxes[i].
|
283 |
+
boxes (Tensor): boxes to transform, of shape (N, 4)
|
284 |
+
"""
|
285 |
+
# Ensure the output is a valid box. See Sec 2.1 of https://arxiv.org/abs/2006.09214
|
286 |
+
deltas = F.relu(deltas)
|
287 |
+
boxes = boxes.to(deltas.dtype)
|
288 |
+
|
289 |
+
ctr_x = 0.5 * (boxes[:, 0] + boxes[:, 2])
|
290 |
+
ctr_y = 0.5 * (boxes[:, 1] + boxes[:, 3])
|
291 |
+
if self.normalize_by_size:
|
292 |
+
stride_w = boxes[:, 2] - boxes[:, 0]
|
293 |
+
stride_h = boxes[:, 3] - boxes[:, 1]
|
294 |
+
strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
|
295 |
+
deltas = deltas * strides
|
296 |
+
|
297 |
+
l = deltas[:, 0::4]
|
298 |
+
t = deltas[:, 1::4]
|
299 |
+
r = deltas[:, 2::4]
|
300 |
+
b = deltas[:, 3::4]
|
301 |
+
|
302 |
+
pred_boxes = torch.zeros_like(deltas)
|
303 |
+
pred_boxes[:, 0::4] = ctr_x[:, None] - l # x1
|
304 |
+
pred_boxes[:, 1::4] = ctr_y[:, None] - t # y1
|
305 |
+
pred_boxes[:, 2::4] = ctr_x[:, None] + r # x2
|
306 |
+
pred_boxes[:, 3::4] = ctr_y[:, None] + b # y2
|
307 |
+
return pred_boxes
|
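And the same kind of round trip for this linear, FCOS-style transform (a sketch; the anchor center must lie inside the target box, as the get_deltas docstring notes):

import torch
from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransformLinear

lin = Box2BoxTransformLinear(normalize_by_size=True)
anchor = torch.tensor([[4.0, 4.0, 12.0, 12.0]])          # center (8, 8), size 8x8
gt = torch.tensor([[2.0, 5.0, 20.0, 11.0]])
deltas = lin.get_deltas(anchor, gt)                      # (l, t, r, b) distances / anchor size
assert torch.allclose(lin.apply_deltas(deltas, anchor), gt)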
308 |
+
|
309 |
+
|
310 |
+
def _dense_box_regression_loss(
|
311 |
+
anchors: List[Union[Boxes, torch.Tensor]],
|
312 |
+
box2box_transform: Box2BoxTransform,
|
313 |
+
pred_anchor_deltas: List[torch.Tensor],
|
314 |
+
gt_boxes: List[torch.Tensor],
|
315 |
+
fg_mask: torch.Tensor,
|
316 |
+
box_reg_loss_type="smooth_l1",
|
317 |
+
smooth_l1_beta=0.0,
|
318 |
+
):
|
319 |
+
"""
|
320 |
+
Compute loss for dense multi-level box regression.
|
321 |
+
Loss is accumulated over ``fg_mask``.
|
322 |
+
|
323 |
+
Args:
|
324 |
+
anchors: #lvl anchor boxes, each is (HixWixA, 4)
|
325 |
+
pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
|
326 |
+
gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
|
327 |
+
fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
|
328 |
+
box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou",
|
329 |
+
"diou", "ciou".
|
330 |
+
smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
|
331 |
+
use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
|
332 |
+
"""
|
333 |
+
if isinstance(anchors[0], Boxes):
|
334 |
+
anchors = type(anchors[0]).cat(anchors).tensor # (R, 4)
|
335 |
+
else:
|
336 |
+
anchors = cat(anchors)
|
337 |
+
if box_reg_loss_type == "smooth_l1":
|
338 |
+
gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
|
339 |
+
gt_anchor_deltas = torch.stack(gt_anchor_deltas) # (N, R, 4)
|
340 |
+
loss_box_reg = smooth_l1_loss(
|
341 |
+
cat(pred_anchor_deltas, dim=1)[fg_mask],
|
342 |
+
gt_anchor_deltas[fg_mask],
|
343 |
+
beta=smooth_l1_beta,
|
344 |
+
reduction="sum",
|
345 |
+
)
|
346 |
+
elif box_reg_loss_type == "giou":
|
347 |
+
pred_boxes = [
|
348 |
+
box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
|
349 |
+
]
|
350 |
+
loss_box_reg = giou_loss(
|
351 |
+
torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
|
352 |
+
)
|
353 |
+
elif box_reg_loss_type == "diou":
|
354 |
+
pred_boxes = [
|
355 |
+
box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
|
356 |
+
]
|
357 |
+
loss_box_reg = diou_loss(
|
358 |
+
torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
|
359 |
+
)
|
360 |
+
elif box_reg_loss_type == "ciou":
|
361 |
+
pred_boxes = [
|
362 |
+
box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
|
363 |
+
]
|
364 |
+
loss_box_reg = ciou_loss(
|
365 |
+
torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
|
366 |
+
)
|
367 |
+
else:
|
368 |
+
raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")
|
369 |
+
return loss_box_reg
|
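A toy call (synthetic single-level inputs, no real model) showing the shapes the loss helper above expects; with zero predicted deltas against matching ground truth the loss is exactly zero.

import torch
from annotator.oneformer.detectron2.modeling.box_regression import (
    Box2BoxTransform,
    _dense_box_regression_loss,
)

N, R = 2, 3                                              # images, anchors in the single level
anchors = [torch.tensor([[0.0, 0.0, 10.0, 10.0],
                         [5.0, 5.0, 15.0, 15.0],
                         [20.0, 20.0, 40.0, 40.0]])]     # list of per-level (R, 4) tensors
pred_anchor_deltas = [torch.zeros(N, R, 4)]              # predict "no change" everywhere
gt_boxes = [anchors[0].clone(), anchors[0].clone()]      # per-image (R, 4) matched gt boxes
fg_mask = torch.tensor([[True, True, False],
                        [True, False, False]])
loss = _dense_box_regression_loss(
    anchors, Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0)),
    pred_anchor_deltas, gt_boxes, fg_mask, box_reg_loss_type="smooth_l1",
)
assert loss.item() == 0.0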
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/matcher.py
ADDED
@@ -0,0 +1,127 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
from typing import List
|
3 |
+
import torch
|
4 |
+
|
5 |
+
from annotator.oneformer.detectron2.layers import nonzero_tuple
|
6 |
+
|
7 |
+
|
8 |
+
# TODO: the name is too general
|
9 |
+
class Matcher(object):
|
10 |
+
"""
|
11 |
+
This class assigns to each predicted "element" (e.g., a box) a ground-truth
|
12 |
+
element. Each predicted element will have exactly zero or one matches; each
|
13 |
+
ground-truth element may be matched to zero or more predicted elements.
|
14 |
+
|
15 |
+
The matching is determined by the MxN match_quality_matrix, which characterizes
|
16 |
+
how well each (ground-truth, prediction)-pair match each other. For example,
|
17 |
+
if the elements are boxes, this matrix may contain box intersection-over-union
|
18 |
+
overlap values.
|
19 |
+
|
20 |
+
The matcher returns (a) a vector of length N containing the index of the
|
21 |
+
ground-truth element m in [0, M) that matches to prediction n in [0, N).
|
22 |
+
(b) a vector of length N containing the labels for each prediction.
|
23 |
+
"""
|
24 |
+
|
25 |
+
def __init__(
|
26 |
+
self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False
|
27 |
+
):
|
28 |
+
"""
|
29 |
+
Args:
|
30 |
+
thresholds (list): a list of thresholds used to stratify predictions
|
31 |
+
into levels.
|
32 |
+
labels (list): a list of values to label predictions belonging to
|
33 |
+
each level. A label can be one of {-1, 0, 1} signifying
|
34 |
+
{ignore, negative class, positive class}, respectively.
|
35 |
+
allow_low_quality_matches (bool): if True, produce additional matches
|
36 |
+
for predictions with maximum match quality lower than high_threshold.
|
37 |
+
See set_low_quality_matches_ for more details.
|
38 |
+
|
39 |
+
For example,
|
40 |
+
thresholds = [0.3, 0.5]
|
41 |
+
labels = [0, -1, 1]
|
42 |
+
All predictions with iou < 0.3 will be marked with 0 and
|
43 |
+
thus will be considered as false positives while training.
|
44 |
+
All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
|
45 |
+
thus will be ignored.
|
46 |
+
All predictions with 0.5 <= iou will be marked with 1 and
|
47 |
+
thus will be considered as true positives.
|
48 |
+
"""
|
49 |
+
# Add -inf and +inf to first and last position in thresholds
|
50 |
+
thresholds = thresholds[:]
|
51 |
+
assert thresholds[0] > 0
|
52 |
+
thresholds.insert(0, -float("inf"))
|
53 |
+
thresholds.append(float("inf"))
|
54 |
+
# Currently torchscript does not support all + generator
|
55 |
+
assert all([low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])])
|
56 |
+
assert all([l in [-1, 0, 1] for l in labels])
|
57 |
+
assert len(labels) == len(thresholds) - 1
|
58 |
+
self.thresholds = thresholds
|
59 |
+
self.labels = labels
|
60 |
+
self.allow_low_quality_matches = allow_low_quality_matches
|
61 |
+
|
62 |
+
def __call__(self, match_quality_matrix):
|
63 |
+
"""
|
64 |
+
Args:
|
65 |
+
match_quality_matrix (Tensor[float]): an MxN tensor, containing the
|
66 |
+
pairwise quality between M ground-truth elements and N predicted
|
67 |
+
elements. All elements must be >= 0 (due to the use of `torch.nonzero`
|
68 |
+
for selecting indices in :meth:`set_low_quality_matches_`).
|
69 |
+
|
70 |
+
Returns:
|
71 |
+
matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
|
72 |
+
ground-truth index in [0, M)
|
73 |
+
match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
|
74 |
+
whether a prediction is a true or false positive or ignored
|
75 |
+
"""
|
76 |
+
assert match_quality_matrix.dim() == 2
|
77 |
+
if match_quality_matrix.numel() == 0:
|
78 |
+
default_matches = match_quality_matrix.new_full(
|
79 |
+
(match_quality_matrix.size(1),), 0, dtype=torch.int64
|
80 |
+
)
|
81 |
+
# When no gt boxes exist, we define IOU = 0 and therefore set labels
|
82 |
+
# to `self.labels[0]`, which usually defaults to background class 0
|
83 |
+
# To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
|
84 |
+
default_match_labels = match_quality_matrix.new_full(
|
85 |
+
(match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
|
86 |
+
)
|
87 |
+
return default_matches, default_match_labels
|
88 |
+
|
89 |
+
assert torch.all(match_quality_matrix >= 0)
|
90 |
+
|
91 |
+
# match_quality_matrix is M (gt) x N (predicted)
|
92 |
+
# Max over gt elements (dim 0) to find best gt candidate for each prediction
|
93 |
+
matched_vals, matches = match_quality_matrix.max(dim=0)
|
94 |
+
|
95 |
+
match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
|
96 |
+
|
97 |
+
for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
|
98 |
+
low_high = (matched_vals >= low) & (matched_vals < high)
|
99 |
+
match_labels[low_high] = l
|
100 |
+
|
101 |
+
if self.allow_low_quality_matches:
|
102 |
+
self.set_low_quality_matches_(match_labels, match_quality_matrix)
|
103 |
+
|
104 |
+
return matches, match_labels
|
105 |
+
|
106 |
+
def set_low_quality_matches_(self, match_labels, match_quality_matrix):
|
107 |
+
"""
|
108 |
+
Produce additional matches for predictions that have only low-quality matches.
|
109 |
+
Specifically, for each ground-truth G find the set of predictions that have
|
110 |
+
maximum overlap with it (including ties); for each prediction in that set, if
|
111 |
+
it is unmatched, then match it to the ground-truth G.
|
112 |
+
|
113 |
+
This function implements the RPN assignment case (i) in Sec. 3.1.2 of
|
114 |
+
:paper:`Faster R-CNN`.
|
115 |
+
"""
|
116 |
+
# For each gt, find the prediction with which it has highest quality
|
117 |
+
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
|
118 |
+
# Find the highest quality match available, even if it is low, including ties.
|
119 |
+
# Note that the matches qualities must be positive due to the use of
|
120 |
+
# `torch.nonzero`.
|
121 |
+
_, pred_inds_with_highest_quality = nonzero_tuple(
|
122 |
+
match_quality_matrix == highest_quality_foreach_gt[:, None]
|
123 |
+
)
|
124 |
+
# If an anchor was labeled positive only due to a low-quality match
|
125 |
+
# with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B.
|
126 |
+
# This follows the implementation in Detectron, and is found to have no significant impact.
|
127 |
+
match_labels[pred_inds_with_highest_quality] = 1
|
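Editor's note: a minimal usage sketch of the thresholds/labels example from the docstring above, assuming this vendored import path and a made-up 2x4 IoU matrix; it is an illustration, not part of the committed file.

import torch

from annotator.oneformer.detectron2.modeling.matcher import Matcher

# 2 ground-truth boxes (rows) x 4 predictions (columns), fabricated IoU values.
iou = torch.tensor([
    [0.10, 0.40, 0.70, 0.05],
    [0.20, 0.35, 0.10, 0.60],
])
matcher = Matcher(thresholds=[0.3, 0.5], labels=[0, -1, 1], allow_low_quality_matches=False)
matches, match_labels = matcher(iou)
# matches      -> best gt index per prediction: tensor([1, 0, 0, 1])
# match_labels -> 0 (negative) for iou < 0.3, -1 (ignore) for 0.3 <= iou < 0.5,
#                 1 (positive) for iou >= 0.5: tensor([0, -1, 1, 1])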
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/__init__.py
ADDED
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.

from .build import META_ARCH_REGISTRY, build_model  # isort:skip

from .panoptic_fpn import PanopticFPN

# import all the meta_arch, so they will be registered
from .rcnn import GeneralizedRCNN, ProposalNetwork
from .dense_detector import DenseDetector
from .retinanet import RetinaNet
from .fcos import FCOS
from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head


__all__ = list(globals().keys())
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/build.py
ADDED
@@ -0,0 +1,24 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import torch

from annotator.oneformer.detectron2.utils.logger import _log_api_usage
from annotator.oneformer.detectron2.utils.registry import Registry

META_ARCH_REGISTRY = Registry("META_ARCH")  # noqa F401 isort:skip
META_ARCH_REGISTRY.__doc__ = """
Registry for meta-architectures, i.e. the whole model.

The registered object will be called with `obj(cfg)`
and expected to return a `nn.Module` object.
"""


def build_model(cfg):
    """
    Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
    Note that it does not load any weights from ``cfg``.
    """
    meta_arch = cfg.MODEL.META_ARCHITECTURE
    model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
    _log_api_usage("modeling.meta_arch." + meta_arch)
    return model
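Editor's note: a hedged sketch of how the registry above is meant to be used, with a hypothetical `TinyDetector` class of my own invention; the decorator registers the class under its name, and build_model looks that name up from cfg.MODEL.META_ARCHITECTURE.

import torch
from torch import nn

from annotator.oneformer.detectron2.config import get_cfg
from annotator.oneformer.detectron2.modeling import META_ARCH_REGISTRY, build_model


@META_ARCH_REGISTRY.register()
class TinyDetector(nn.Module):  # hypothetical meta-architecture, for illustration only
    def __init__(self, cfg):
        super().__init__()
        self.conv = nn.Conv2d(3, 1, kernel_size=3, padding=1)

    def forward(self, batched_inputs):
        images = torch.stack([x["image"].float() for x in batched_inputs])
        return self.conv(images)


cfg = get_cfg()
cfg.MODEL.META_ARCHITECTURE = "TinyDetector"
model = build_model(cfg)  # resolves "TinyDetector" in META_ARCH_REGISTRY and calls it with cfg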
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/dense_detector.py
ADDED
@@ -0,0 +1,294 @@
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import Tensor, nn

from annotator.oneformer.detectron2.data.detection_utils import convert_image_to_rgb
from annotator.oneformer.detectron2.layers import move_device_like
from annotator.oneformer.detectron2.modeling import Backbone
from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances
from annotator.oneformer.detectron2.utils.events import get_event_storage

from ..postprocessing import detector_postprocess


def permute_to_N_HWA_K(tensor, K: int):
    """
    Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K)
    """
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)
    tensor = tensor.reshape(N, -1, K)  # Size=(N,HWA,K)
    return tensor
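Editor's note: a quick sanity-check sketch of the reshape performed by `permute_to_N_HWA_K`, with fabricated sizes; the import assumes this vendored file path.

import torch

from annotator.oneformer.detectron2.modeling.meta_arch.dense_detector import permute_to_N_HWA_K

# Fabricated sizes: N=2 images, A=3 anchors, K=4 values per anchor, 5x6 feature map.
N, A, K, H, W = 2, 3, 4, 5, 6
x = torch.randn(N, A * K, H, W)
y = permute_to_N_HWA_K(x, K)
assert y.shape == (N, H * W * A, K)
# The (H, W, A) dimensions are flattened in that order, so row i of y[n] holds the K values
# for anchor a at location (h, w) with i = (h * W + w) * A + a.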


class DenseDetector(nn.Module):
    """
    Base class for dense detector. We define a dense detector as a fully-convolutional model that
    makes per-pixel (i.e. dense) predictions.
    """

    def __init__(
        self,
        backbone: Backbone,
        head: nn.Module,
        head_in_features: Optional[List[str]] = None,
        *,
        pixel_mean,
        pixel_std,
    ):
        """
        Args:
            backbone: backbone module
            head: head module
            head_in_features: backbone features to use in head. Default to all backbone features.
            pixel_mean (Tuple[float]):
                Values to be used for image normalization (BGR order).
                To train on images of different number of channels, set different mean & std.
                Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
            pixel_std (Tuple[float]):
                When using pre-trained models in Detectron1 or any MSRA models,
                std has been absorbed into its conv1 weights, so the std needs to be set 1.
                Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
        """
        super().__init__()

        self.backbone = backbone
        self.head = head
        if head_in_features is None:
            shapes = self.backbone.output_shape()
            self.head_in_features = sorted(shapes.keys(), key=lambda x: shapes[x].stride)
        else:
            self.head_in_features = head_in_features
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)

    @property
    def device(self):
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)

    def forward(self, batched_inputs: List[Dict[str, Tensor]]):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the
            loss. Used during training only. In inference, the standard output format, described
            in :doc:`/tutorials/models`.
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        features = [features[f] for f in self.head_in_features]
        predictions = self.head(features)

        if self.training:
            assert not torch.jit.is_scripting(), "Not supported"
            assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
            return self.forward_training(images, features, predictions, gt_instances)
        else:
            results = self.forward_inference(images, features, predictions)
            if torch.jit.is_scripting():
                return results

            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes
            ):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results

    def forward_training(self, images, features, predictions, gt_instances):
        raise NotImplementedError()

    def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]):
        """
        Normalize, pad and batch the input images.
        """
        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        return images

    def _transpose_dense_predictions(
        self, predictions: List[List[Tensor]], dims_per_anchor: List[int]
    ) -> List[List[Tensor]]:
        """
        Transpose the dense per-level predictions.

        Args:
            predictions: a list of outputs, each is a list of per-level
                predictions with shape (N, Ai x K, Hi, Wi), where N is the
                number of images, Ai is the number of anchors per location on
                level i, K is the dimension of predictions per anchor.
            dims_per_anchor: the value of K for each prediction, e.g. 4 for
                box prediction, #classes for classification prediction.

        Returns:
            List[List[Tensor]]: each prediction is transposed to (N, Hi x Wi x Ai, K).
        """
        assert len(predictions) == len(dims_per_anchor)
        res: List[List[Tensor]] = []
        for pred, dim_per_anchor in zip(predictions, dims_per_anchor):
            pred = [permute_to_N_HWA_K(x, dim_per_anchor) for x in pred]
            res.append(pred)
        return res

    def _ema_update(self, name: str, value: float, initial_value: float, momentum: float = 0.9):
        """
        Apply EMA update to `self.name` using `value`.

        This is mainly used for loss normalizer. In Detectron1, loss is normalized by number
        of foreground samples in the batch. When batch size is 1 per GPU, #foreground has a
        large variance and using it leads to lower performance. Therefore we maintain an EMA of
        #foreground to stabilize the normalizer.

        Args:
            name: name of the normalizer
            value: the new value to update
            initial_value: the initial value to start with
            momentum: momentum of EMA

        Returns:
            float: the updated EMA value
        """
        if hasattr(self, name):
            old = getattr(self, name)
        else:
            old = initial_value
        new = old * momentum + value * (1 - momentum)
        setattr(self, name, new)
        return new

    def _decode_per_level_predictions(
        self,
        anchors: Boxes,
        pred_scores: Tensor,
        pred_deltas: Tensor,
        score_thresh: float,
        topk_candidates: int,
        image_size: Tuple[int, int],
    ) -> Instances:
        """
        Decode boxes and classification predictions of one feature level, by
        the following steps:
        1. filter the predictions based on score threshold and top K scores.
        2. transform the box regression outputs
        3. return the predicted scores, classes and boxes

        Args:
            anchors: Boxes, anchor for this feature level
            pred_scores: HxWxA,K
            pred_deltas: HxWxA,4

        Returns:
            Instances: with field "scores", "pred_boxes", "pred_classes".
        """
        # Apply two filters to make NMS faster.
        # 1. Keep boxes with confidence score higher than threshold
        keep_idxs = pred_scores > score_thresh
        pred_scores = pred_scores[keep_idxs]
        topk_idxs = torch.nonzero(keep_idxs)  # Kx2

        # 2. Keep only the top k scoring boxes
        topk_idxs_size = topk_idxs.shape[0]
        if isinstance(topk_idxs_size, Tensor):
            # It's a tensor in tracing
            num_topk = torch.clamp(topk_idxs_size, max=topk_candidates)
        else:
            num_topk = min(topk_idxs_size, topk_candidates)
        pred_scores, idxs = pred_scores.topk(num_topk)
        topk_idxs = topk_idxs[idxs]

        anchor_idxs, classes_idxs = topk_idxs.unbind(dim=1)

        pred_boxes = self.box2box_transform.apply_deltas(
            pred_deltas[anchor_idxs], anchors.tensor[anchor_idxs]
        )
        return Instances(
            image_size, pred_boxes=Boxes(pred_boxes), scores=pred_scores, pred_classes=classes_idxs
        )

    def _decode_multi_level_predictions(
        self,
        anchors: List[Boxes],
        pred_scores: List[Tensor],
        pred_deltas: List[Tensor],
        score_thresh: float,
        topk_candidates: int,
        image_size: Tuple[int, int],
    ) -> Instances:
        """
        Run `_decode_per_level_predictions` for all feature levels and concat the results.
        """
        predictions = [
            self._decode_per_level_predictions(
                anchors_i,
                box_cls_i,
                box_reg_i,
                self.test_score_thresh,
                self.test_topk_candidates,
                image_size,
            )
            # Iterate over every feature level
            for box_cls_i, box_reg_i, anchors_i in zip(pred_scores, pred_deltas, anchors)
        ]
        return predictions[0].cat(predictions)  # 'Instances.cat' is not scriptable but this is

    def visualize_training(self, batched_inputs, results):
        """
        A function used to visualize ground truth images and final network predictions.
        It shows ground truth bounding boxes on the original image and up to 20
        predicted object bounding boxes on the original image.

        Args:
            batched_inputs (list): a list that contains input to the model.
            results (List[Instances]): a list of #images elements returned by forward_inference().
        """
        from annotator.oneformer.detectron2.utils.visualizer import Visualizer

        assert len(batched_inputs) == len(
            results
        ), "Cannot visualize inputs and results of different sizes"
        storage = get_event_storage()
        max_boxes = 20

        image_index = 0  # only visualize a single image
        img = batched_inputs[image_index]["image"]
        img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
        v_gt = Visualizer(img, None)
        v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
        anno_img = v_gt.get_image()
        processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
        predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()

        v_pred = Visualizer(img, None)
        v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
        prop_img = v_pred.get_image()
        vis_img = np.vstack((anno_img, prop_img))
        vis_img = vis_img.transpose(2, 0, 1)
        vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
        storage.put_image(vis_name, vis_img)
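Editor's note: a standalone sketch of the two-stage filtering idea used in `_decode_per_level_predictions`, on made-up scores and without any detectron2 objects, to show what the threshold + top-k steps keep.

import torch

score_thresh, topk_candidates = 0.05, 3
# Fabricated flattened (HxWxA, K) class scores for one feature level: 4 anchors, 2 classes.
pred_scores = torch.tensor([
    [0.01, 0.90],
    [0.20, 0.02],
    [0.60, 0.70],
    [0.03, 0.04],
])

# 1. Keep (anchor, class) pairs whose score clears the threshold.
keep_idxs = pred_scores > score_thresh
kept_scores = pred_scores[keep_idxs]   # flat scores above threshold
topk_idxs = torch.nonzero(keep_idxs)   # their (anchor, class) indices, Kx2

# 2. Of those, keep only the top-k scoring pairs.
num_topk = min(topk_idxs.shape[0], topk_candidates)
kept_scores, order = kept_scores.topk(num_topk)
topk_idxs = topk_idxs[order]
anchor_idxs, class_idxs = topk_idxs.unbind(dim=1)
# kept_scores -> tensor([0.90, 0.70, 0.60]); anchor_idxs -> tensor([0, 2, 2]); class_idxs -> tensor([1, 1, 0])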
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/fcos.py
ADDED
@@ -0,0 +1,328 @@
# Copyright (c) Facebook, Inc. and its affiliates.

import logging
from typing import List, Optional, Tuple
import torch
from fvcore.nn import sigmoid_focal_loss_jit
from torch import nn
from torch.nn import functional as F

from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms
from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_point_box_distance
from annotator.oneformer.detectron2.utils.events import get_event_storage

from ..anchor_generator import DefaultAnchorGenerator
from ..backbone import Backbone
from ..box_regression import Box2BoxTransformLinear, _dense_box_regression_loss
from .dense_detector import DenseDetector
from .retinanet import RetinaNetHead

__all__ = ["FCOS"]

logger = logging.getLogger(__name__)


class FCOS(DenseDetector):
    """
    Implement FCOS in :paper:`fcos`.
    """

    def __init__(
        self,
        *,
        backbone: Backbone,
        head: nn.Module,
        head_in_features: Optional[List[str]] = None,
        box2box_transform=None,
        num_classes,
        center_sampling_radius: float = 1.5,
        focal_loss_alpha=0.25,
        focal_loss_gamma=2.0,
        test_score_thresh=0.2,
        test_topk_candidates=1000,
        test_nms_thresh=0.6,
        max_detections_per_image=100,
        pixel_mean,
        pixel_std,
    ):
        """
        Args:
            center_sampling_radius: radius of the "center" of a groundtruth box,
                within which all anchor points are labeled positive.
            Other arguments mean the same as in :class:`RetinaNet`.
        """
        super().__init__(
            backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
        )

        self.num_classes = num_classes

        # FCOS uses one anchor point per location.
        # We represent the anchor point by a box whose size equals the anchor stride.
        feature_shapes = backbone.output_shape()
        fpn_strides = [feature_shapes[k].stride for k in self.head_in_features]
        self.anchor_generator = DefaultAnchorGenerator(
            sizes=[[k] for k in fpn_strides], aspect_ratios=[1.0], strides=fpn_strides
        )

        # FCOS parameterizes box regression by a linear transform,
        # where predictions are normalized by anchor stride (equal to anchor size).
        if box2box_transform is None:
            box2box_transform = Box2BoxTransformLinear(normalize_by_size=True)
        self.box2box_transform = box2box_transform

        self.center_sampling_radius = float(center_sampling_radius)

        # Loss parameters:
        self.focal_loss_alpha = focal_loss_alpha
        self.focal_loss_gamma = focal_loss_gamma

        # Inference parameters:
        self.test_score_thresh = test_score_thresh
        self.test_topk_candidates = test_topk_candidates
        self.test_nms_thresh = test_nms_thresh
        self.max_detections_per_image = max_detections_per_image

    def forward_training(self, images, features, predictions, gt_instances):
        # Transpose the Hi*Wi*A dimension to the middle:
        pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
            predictions, [self.num_classes, 4, 1]
        )
        anchors = self.anchor_generator(features)
        gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
        return self.losses(
            anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
        )

    @torch.no_grad()
    def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]):
        """
        Match ground-truth boxes to a set of multi-level anchors.

        Args:
            gt_boxes: Ground-truth boxes from instances of an image.
            anchors: List of anchors for each feature map (of different scales).

        Returns:
            torch.Tensor
                A tensor of shape `(M, R)`, given `M` ground-truth boxes and total
                `R` anchor points from all feature levels, indicating the quality
                of match between m-th box and r-th anchor. Higher value indicates
                better match.
        """
        # Naming convention: (M = ground-truth boxes, R = anchor points)
        # Anchor points are represented as square boxes of size = stride.
        num_anchors_per_level = [len(x) for x in anchors]
        anchors = Boxes.cat(anchors)  # (R, 4)
        anchor_centers = anchors.get_centers()  # (R, 2)
        anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0]  # (R, )

        lower_bound = anchor_sizes * 4
        lower_bound[: num_anchors_per_level[0]] = 0
        upper_bound = anchor_sizes * 8
        upper_bound[-num_anchors_per_level[-1] :] = float("inf")

        gt_centers = gt_boxes.get_centers()

        # FCOS with center sampling: anchor point must be close enough to
        # ground-truth box center.
        center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_()
        sampling_regions = self.center_sampling_radius * anchor_sizes[None, :]

        match_quality_matrix = center_dists.max(dim=2).values < sampling_regions

        pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes)
        pairwise_dist = pairwise_dist.permute(1, 0, 2)  # (M, R, 4)

        # The original FCOS anchor matching rule: anchor point must be inside GT.
        match_quality_matrix &= pairwise_dist.min(dim=2).values > 0

        # Multilevel anchor matching in FCOS: each anchor is only responsible
        # for certain scale range.
        pairwise_dist = pairwise_dist.max(dim=2).values
        match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & (
            pairwise_dist < upper_bound[None, :]
        )
        # Match the GT box with minimum area, if there are multiple GT matches.
        gt_areas = gt_boxes.area()  # (M, )

        match_quality_matrix = match_quality_matrix.to(torch.float32)
        match_quality_matrix *= 1e8 - gt_areas[:, None]
        return match_quality_matrix  # (M, R)

    @torch.no_grad()
    def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
        """
        Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS
        anchor matching rule.

        Unlike RetinaNet, there are no ignored anchors.
        """

        gt_labels, matched_gt_boxes = [], []

        for inst in gt_instances:
            if len(inst) > 0:
                match_quality_matrix = self._match_anchors(inst.gt_boxes, anchors)

                # Find matched ground-truth box per anchor. Un-matched anchors are
                # assigned -1. This is equivalent to using an anchor matcher as used
                # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])`
                match_quality, matched_idxs = match_quality_matrix.max(dim=0)
                matched_idxs[match_quality < 1e-5] = -1

                matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(min=0)]
                gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)]

                # Anchors with matched_idxs = -1 are labeled background.
                gt_labels_i[matched_idxs < 0] = self.num_classes
            else:
                matched_gt_boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor)
                gt_labels_i = torch.full(
                    (len(matched_gt_boxes_i),),
                    fill_value=self.num_classes,
                    dtype=torch.long,
                    device=matched_gt_boxes_i.device,
                )

            gt_labels.append(gt_labels_i)
            matched_gt_boxes.append(matched_gt_boxes_i)

        return gt_labels, matched_gt_boxes

    def losses(
        self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
    ):
        """
        This method is almost identical to :meth:`RetinaNet.losses`, with an extra
        "loss_centerness" in the returned dict.
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (M, R)

        pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
        num_pos_anchors = pos_mask.sum().item()
        get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
        normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 300)

        # classification and regression loss
        gt_labels_target = F.one_hot(gt_labels, num_classes=self.num_classes + 1)[
            :, :, :-1
        ]  # no loss for the last (background) class
        loss_cls = sigmoid_focal_loss_jit(
            torch.cat(pred_logits, dim=1),
            gt_labels_target.to(pred_logits[0].dtype),
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )

        loss_box_reg = _dense_box_regression_loss(
            anchors,
            self.box2box_transform,
            pred_anchor_deltas,
            gt_boxes,
            pos_mask,
            box_reg_loss_type="giou",
        )

        ctrness_targets = self.compute_ctrness_targets(anchors, gt_boxes)  # (M, R)
        pred_centerness = torch.cat(pred_centerness, dim=1).squeeze(dim=2)  # (M, R)
        ctrness_loss = F.binary_cross_entropy_with_logits(
            pred_centerness[pos_mask], ctrness_targets[pos_mask], reduction="sum"
        )
        return {
            "loss_fcos_cls": loss_cls / normalizer,
            "loss_fcos_loc": loss_box_reg / normalizer,
            "loss_fcos_ctr": ctrness_loss / normalizer,
        }

    def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]):
        anchors = Boxes.cat(anchors).tensor  # Rx4
        reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes]
        reg_targets = torch.stack(reg_targets, dim=0)  # NxRx4
        if len(reg_targets) == 0:
            return reg_targets.new_zeros(len(reg_targets))
        left_right = reg_targets[:, :, [0, 2]]
        top_bottom = reg_targets[:, :, [1, 3]]
        ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
            top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
        )
        return torch.sqrt(ctrness)
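Editor's note: the centerness target above reduces to sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b))) for the (l, t, r, b) distances from a point to the four box edges. A standalone numeric sketch with made-up distances:

import math

# Point 10 px from the left edge, 30 px from the right, 8 px from the top and bottom.
l, t, r, b = 10.0, 8.0, 30.0, 8.0
ctrness = math.sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))
# (10/30) * (8/8) = 0.333..., sqrt -> ~0.577. A perfectly centered point gives 1.0 and a point
# on a box edge gives 0.0, which is the value the BCE centerness loss regresses toward.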

    def forward_inference(
        self,
        images: ImageList,
        features: List[torch.Tensor],
        predictions: List[List[torch.Tensor]],
    ):
        pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
            predictions, [self.num_classes, 4, 1]
        )
        anchors = self.anchor_generator(features)

        results: List[Instances] = []
        for img_idx, image_size in enumerate(images.image_sizes):
            scores_per_image = [
                # Multiply and sqrt centerness & classification scores
                # (See eqn. 4 in https://arxiv.org/abs/2006.09214)
                torch.sqrt(x[img_idx].sigmoid_() * y[img_idx].sigmoid_())
                for x, y in zip(pred_logits, pred_centerness)
            ]
            deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
            results_per_image = self.inference_single_image(
                anchors, scores_per_image, deltas_per_image, image_size
            )
            results.append(results_per_image)
        return results

    def inference_single_image(
        self,
        anchors: List[Boxes],
        box_cls: List[torch.Tensor],
        box_delta: List[torch.Tensor],
        image_size: Tuple[int, int],
    ):
        """
        Identical to :meth:`RetinaNet.inference_single_image`.
        """
        pred = self._decode_multi_level_predictions(
            anchors,
            box_cls,
            box_delta,
            self.test_score_thresh,
            self.test_topk_candidates,
            image_size,
        )
        keep = batched_nms(
            pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
        )
        return pred[keep[: self.max_detections_per_image]]


class FCOSHead(RetinaNetHead):
    """
    The head used in :paper:`fcos`. It adds an additional centerness
    prediction branch on top of :class:`RetinaNetHead`.
    """

    def __init__(self, *, input_shape: List[ShapeSpec], conv_dims: List[int], **kwargs):
        super().__init__(input_shape=input_shape, conv_dims=conv_dims, num_anchors=1, **kwargs)
        # Unlike original FCOS, we do not add an additional learnable scale layer
        # because it's found to have no benefits after normalizing regression targets by stride.
        self._num_features = len(input_shape)
        self.ctrness = nn.Conv2d(conv_dims[-1], 1, kernel_size=3, stride=1, padding=1)
        torch.nn.init.normal_(self.ctrness.weight, std=0.01)
        torch.nn.init.constant_(self.ctrness.bias, 0)

    def forward(self, features):
        assert len(features) == self._num_features
        logits = []
        bbox_reg = []
        ctrness = []
        for feature in features:
            logits.append(self.cls_score(self.cls_subnet(feature)))
            bbox_feature = self.bbox_subnet(feature)
            bbox_reg.append(self.bbox_pred(bbox_feature))
            ctrness.append(self.ctrness(bbox_feature))
        return logits, bbox_reg, ctrness
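Editor's note: the inference-time score in `forward_inference` is the geometric mean of the class probability and the centerness probability. A standalone sketch with fabricated logits:

import torch

cls_logit = torch.tensor(2.0)   # fabricated classification logit for one anchor/class
ctr_logit = torch.tensor(-1.0)  # fabricated centerness logit for the same anchor
score = torch.sqrt(torch.sigmoid(cls_logit) * torch.sigmoid(ctr_logit))
# sigmoid(2.0) ~ 0.881 and sigmoid(-1.0) ~ 0.269, so score ~ sqrt(0.237) ~ 0.49:
# confident but off-center predictions are down-weighted before NMS.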
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/panoptic_fpn.py
ADDED
@@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.

import logging
from typing import Dict, List
import torch
from torch import nn

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.structures import ImageList

from ..postprocessing import detector_postprocess, sem_seg_postprocess
from .build import META_ARCH_REGISTRY
from .rcnn import GeneralizedRCNN
from .semantic_seg import build_sem_seg_head

__all__ = ["PanopticFPN"]


@META_ARCH_REGISTRY.register()
class PanopticFPN(GeneralizedRCNN):
    """
    Implement the paper :paper:`PanopticFPN`.
    """

    @configurable
    def __init__(
        self,
        *,
        sem_seg_head: nn.Module,
        combine_overlap_thresh: float = 0.5,
        combine_stuff_area_thresh: float = 4096,
        combine_instances_score_thresh: float = 0.5,
        **kwargs,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            sem_seg_head: a module for the semantic segmentation head.
            combine_overlap_thresh: combine masks into one instance if
                they have enough overlap
            combine_stuff_area_thresh: ignore stuff areas smaller than this threshold
            combine_instances_score_thresh: ignore instances whose score is
                smaller than this threshold

        Other arguments are the same as :class:`GeneralizedRCNN`.
        """
        super().__init__(**kwargs)
        self.sem_seg_head = sem_seg_head
        # options when combining instance & semantic outputs
        self.combine_overlap_thresh = combine_overlap_thresh
        self.combine_stuff_area_thresh = combine_stuff_area_thresh
        self.combine_instances_score_thresh = combine_instances_score_thresh

    @classmethod
    def from_config(cls, cfg):
        ret = super().from_config(cfg)
        ret.update(
            {
                "combine_overlap_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH,
                "combine_stuff_area_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT,
                "combine_instances_score_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH,  # noqa
            }
        )
        ret["sem_seg_head"] = build_sem_seg_head(cfg, ret["backbone"].output_shape())
        logger = logging.getLogger(__name__)
        if not cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED:
            logger.warning(
                "PANOPTIC_FPN.COMBINED.ENABLED is no longer used. "
                " model.inference(do_postprocess=) should be used to toggle postprocessing."
            )
        if cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT != 1.0:
            w = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
            logger.warning(
                "PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT should be replaced by weights on each ROI head."
            )

            def update_weight(x):
                if isinstance(x, dict):
                    return {k: v * w for k, v in x.items()}
                else:
                    return x * w

            roi_heads = ret["roi_heads"]
            roi_heads.box_predictor.loss_weight = update_weight(roi_heads.box_predictor.loss_weight)
            roi_heads.mask_head.loss_weight = update_weight(roi_heads.mask_head.loss_weight)
        return ret

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                * "image": Tensor, image in (C, H, W) format.
                * "instances": Instances
                * "sem_seg": semantic segmentation ground truth.
                * Other information that's included in the original dicts, such as:
                  "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            list[dict]:
                each dict has the results for one image. The dict contains the following keys:

                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
                * "panoptic_seg": See the return value of
                  :func:`combine_semantic_and_instance_outputs` for its format.
        """
        if not self.training:
            return self.inference(batched_inputs)
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        assert "sem_seg" in batched_inputs[0]
        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
        gt_sem_seg = ImageList.from_tensors(
            gt_sem_seg,
            self.backbone.size_divisibility,
            self.sem_seg_head.ignore_value,
            self.backbone.padding_constraints,
        ).tensor
        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)

        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        detector_results, detector_losses = self.roi_heads(
            images, features, proposals, gt_instances
        )

        losses = sem_seg_losses
        losses.update(proposal_losses)
        losses.update(detector_losses)
        return losses

    def inference(self, batched_inputs: List[Dict[str, torch.Tensor]], do_postprocess: bool = True):
        """
        Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`
            do_postprocess (bool): whether to apply post-processing on the outputs.

        Returns:
            When do_postprocess=True, see docs in :meth:`forward`.
            Otherwise, returns a (list[Instances], list[Tensor]) that contains
            the raw detector outputs, and raw semantic segmentation outputs.
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, None)
        proposals, _ = self.proposal_generator(images, features, None)
        detector_results, _ = self.roi_heads(images, features, proposals, None)

        if do_postprocess:
            processed_results = []
            for sem_seg_result, detector_result, input_per_image, image_size in zip(
                sem_seg_results, detector_results, batched_inputs, images.image_sizes
            ):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
                detector_r = detector_postprocess(detector_result, height, width)

                processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})

                panoptic_r = combine_semantic_and_instance_outputs(
                    detector_r,
                    sem_seg_r.argmax(dim=0),
                    self.combine_overlap_thresh,
                    self.combine_stuff_area_thresh,
                    self.combine_instances_score_thresh,
                )
                processed_results[-1]["panoptic_seg"] = panoptic_r
            return processed_results
        else:
            return detector_results, sem_seg_results


def combine_semantic_and_instance_outputs(
    instance_results,
    semantic_results,
    overlap_threshold,
    stuff_area_thresh,
    instances_score_thresh,
):
    """
    Implement a simple combining logic following
    "combine_semantic_and_instance_predictions.py" in panopticapi
    to produce panoptic segmentation outputs.

    Args:
        instance_results: output of :func:`detector_postprocess`.
        semantic_results: an (H, W) tensor, each element is the contiguous semantic
            category id

    Returns:
        panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
        segments_info (list[dict]): Describe each segment in `panoptic_seg`.
            Each dict contains keys "id", "category_id", "isthing".
    """
    panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)

    # sort instance outputs by scores
    sorted_inds = torch.argsort(-instance_results.scores)

    current_segment_id = 0
    segments_info = []

    instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)

    # Add instances one-by-one, check for overlaps with existing ones
    for inst_id in sorted_inds:
        score = instance_results.scores[inst_id].item()
        if score < instances_score_thresh:
            break
        mask = instance_masks[inst_id]  # H,W
        mask_area = mask.sum().item()

        if mask_area == 0:
            continue

        intersect = (mask > 0) & (panoptic_seg > 0)
        intersect_area = intersect.sum().item()

        if intersect_area * 1.0 / mask_area > overlap_threshold:
            continue

        if intersect_area > 0:
            mask = mask & (panoptic_seg == 0)

        current_segment_id += 1
        panoptic_seg[mask] = current_segment_id
        segments_info.append(
            {
                "id": current_segment_id,
                "isthing": True,
                "score": score,
                "category_id": instance_results.pred_classes[inst_id].item(),
                "instance_id": inst_id.item(),
            }
        )

    # Add semantic results to remaining empty areas
    semantic_labels = torch.unique(semantic_results).cpu().tolist()
    for semantic_label in semantic_labels:
        if semantic_label == 0:  # 0 is a special "thing" class
            continue
        mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
        mask_area = mask.sum().item()
        if mask_area < stuff_area_thresh:
            continue

        current_segment_id += 1
        panoptic_seg[mask] = current_segment_id
        segments_info.append(
            {
                "id": current_segment_id,
                "isthing": False,
                "category_id": semantic_label,
                "area": mask_area,
            }
        )

    return panoptic_seg, segments_info
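Editor's note: a minimal, self-contained sketch of the overlap rule used in `combine_semantic_and_instance_outputs`: an instance mask is dropped when too much of it is already claimed by higher-scoring segments. The masks and threshold below are made up.

import torch

panoptic_seg = torch.zeros(4, 4, dtype=torch.int32)
panoptic_seg[:, :2] = 1                  # pretend segment 1 already claimed the left half
new_mask = torch.zeros(4, 4, dtype=torch.bool)
new_mask[:, 1:3] = True                  # candidate instance straddles the boundary

overlap_threshold = 0.5
intersect_area = (new_mask & (panoptic_seg > 0)).sum().item()  # 4 pixels overlap existing segments
mask_area = new_mask.sum().item()                              # 8 pixels total
if intersect_area / mask_area <= overlap_threshold:
    # keep only the unclaimed pixels, exactly as the loop above does
    new_mask = new_mask & (panoptic_seg == 0)
    panoptic_seg[new_mask] = 2
# Here the overlap ratio is exactly 0.5, so the instance is kept on its free pixels (column 2).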
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/rcnn.py
ADDED
@@ -0,0 +1,341 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import nn

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.data.detection_utils import convert_image_to_rgb
from annotator.oneformer.detectron2.layers import move_device_like
from annotator.oneformer.detectron2.structures import ImageList, Instances
from annotator.oneformer.detectron2.utils.events import get_event_storage
from annotator.oneformer.detectron2.utils.logger import log_first_n

from ..backbone import Backbone, build_backbone
from ..postprocessing import detector_postprocess
from ..proposal_generator import build_proposal_generator
from ..roi_heads import build_roi_heads
from .build import META_ARCH_REGISTRY

__all__ = ["GeneralizedRCNN", "ProposalNetwork"]


@META_ARCH_REGISTRY.register()
class GeneralizedRCNN(nn.Module):
    """
    Generalized R-CNN. Any model that contains the following three components:
    1. Per-image feature extraction (aka backbone)
    2. Region proposal generation
    3. Per-region feature extraction and prediction
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        proposal_generator: nn.Module,
        roi_heads: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
        input_format: Optional[str] = None,
        vis_period: int = 0,
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            proposal_generator: a module that generates proposals using backbone features
            roi_heads: a ROI head that performs per-region computation
            pixel_mean, pixel_std: list or tuple with #channels elements, representing
                the per-channel mean and std to be used to normalize the input image
            input_format: describe the meaning of channels of input. Needed by visualization
            vis_period: the period to run visualization. Set to 0 to disable.
        """
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.roi_heads = roi_heads

        self.input_format = input_format
        self.vis_period = vis_period
        if vis_period > 0:
            assert input_format is not None, "input_format is required for visualization!"

        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "roi_heads": build_roi_heads(cfg, backbone.output_shape()),
            "input_format": cfg.INPUT.FORMAT,
            "vis_period": cfg.VIS_PERIOD,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)

    def visualize_training(self, batched_inputs, proposals):
        """
        A function used to visualize images and proposals. It shows ground truth
        bounding boxes on the original image and up to 20 top-scoring predicted
        object proposals on the original image. Users can implement different
        visualization functions for different models.

        Args:
            batched_inputs (list): a list that contains input to the model.
            proposals (list): a list that contains predicted proposals. Both
                batched_inputs and proposals should have the same length.
        """
        from annotator.oneformer.detectron2.utils.visualizer import Visualizer

        storage = get_event_storage()
        max_vis_prop = 20

        for input, prop in zip(batched_inputs, proposals):
            img = input["image"]
            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            v_gt = Visualizer(img, None)
            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
            anno_img = v_gt.get_image()
            box_size = min(len(prop.proposal_boxes), max_vis_prop)
            v_pred = Visualizer(img, None)
            v_pred = v_pred.overlay_instances(
                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
            )
            prop_img = v_pred.get_image()
            vis_img = np.concatenate((anno_img, prop_img), axis=1)
            vis_img = vis_img.transpose(2, 0, 1)
            vis_name = "Left: GT bounding boxes; Right: Predicted proposals"
            storage.put_image(vis_name, vis_img)
            break  # only visualize one image in a batch

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances (optional): groundtruth :class:`Instances`
                * proposals (optional): :class:`Instances`, precomputed proposals.

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "instances" whose value is a :class:`Instances`.
                The :class:`Instances` object has the following keys:
                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
        """
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)

        if self.proposal_generator is not None:
            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
            proposal_losses = {}

        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                self.visualize_training(batched_inputs, proposals)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

    def inference(
        self,
        batched_inputs: List[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        """
        Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`
            detected_instances (None or list[Instances]): if not None, it
                contains an `Instances` object per image. The `Instances`
                object contains "pred_boxes" and "pred_classes" which are
                known boxes in the image.
                The inference will then skip the detection of bounding boxes,
                and only predict other per-ROI outputs.
            do_postprocess (bool): whether to apply post-processing on the outputs.

        Returns:
            When do_postprocess=True, same as in :meth:`forward`.
            Otherwise, a list[Instances] containing raw network outputs.
        """
        assert not self.training

        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        if detected_instances is None:
            if self.proposal_generator is not None:
                proposals, _ = self.proposal_generator(images, features, None)
            else:
                assert "proposals" in batched_inputs[0]
                proposals = [x["proposals"].to(self.device) for x in batched_inputs]

            results, _ = self.roi_heads(images, features, proposals, None)
        else:
            detected_instances = [x.to(self.device) for x in detected_instances]
            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
        return results

    def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """
        Normalize, pad and batch the input images.
        """
        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        return images

    @staticmethod
    def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes):
        """
        Rescale the output instances to the target size.
        """
        # note: private function; subject to changes
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            instances, batched_inputs, image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results


@META_ARCH_REGISTRY.register()
class ProposalNetwork(nn.Module):
    """
    A meta architecture that only predicts object proposals.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        proposal_generator: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            proposal_generator: a module that generates proposals using backbone features
            pixel_mean, pixel_std: list or tuple with #channels elements, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)

    def forward(self, batched_inputs):
        """
        Args:
            Same as in :class:`GeneralizedRCNN.forward`

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "proposals" whose value is a
                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
        """
        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        features = self.backbone(images.tensor)

        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
            )
            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        # In training, the proposals are not useful at all but we generate them anyway.
        # This makes RPN-only models about 5% slower.
        if self.training:
            return proposal_losses

        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            proposals, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
|
339 |
+
r = detector_postprocess(results_per_image, height, width)
|
340 |
+
processed_results.append({"proposals": r})
|
341 |
+
return processed_results
|
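Not part of rcnn.py above, but a minimal sketch of how a GeneralizedRCNN-style model is typically driven at inference time, based on the `inference`/`preprocess_image`/`_postprocess` contracts in this file. How `model` itself is built (e.g. via detectron2's `build_model(cfg)`) is an assumption, not shown in this diff.

import torch

# Hypothetical helper: assumes `model` is an already-built GeneralizedRCNN-style module
# and `bgr_image` is an HxWx3 uint8 numpy array in the model's expected channel order.
def run_inference(model, bgr_image):
    model.eval()
    # (C, H, W) tensor, as expected by preprocess_image()
    image = torch.as_tensor(bgr_image.transpose(2, 0, 1).copy())
    inputs = [{
        "image": image,
        "height": bgr_image.shape[0],  # optional: _postprocess rescales output to this size
        "width": bgr_image.shape[1],
    }]
    with torch.no_grad():
        outputs = model.inference(inputs, do_postprocess=True)
    # one dict per input image, with an "instances" key (see _postprocess above)
    return outputs[0]["instances"]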
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/retinanet.py
ADDED
@@ -0,0 +1,439 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import math
from typing import List, Tuple
import torch
from fvcore.nn import sigmoid_focal_loss_jit
from torch import Tensor, nn
from torch.nn import functional as F

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.layers import CycleBatchNormList, ShapeSpec, batched_nms, cat, get_norm
from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
from annotator.oneformer.detectron2.utils.events import get_event_storage

from ..anchor_generator import build_anchor_generator
from ..backbone import Backbone, build_backbone
from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
from ..matcher import Matcher
from .build import META_ARCH_REGISTRY
from .dense_detector import DenseDetector, permute_to_N_HWA_K  # noqa

__all__ = ["RetinaNet"]


logger = logging.getLogger(__name__)


@META_ARCH_REGISTRY.register()
class RetinaNet(DenseDetector):
    """
    Implement RetinaNet in :paper:`RetinaNet`.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        head: nn.Module,
        head_in_features,
        anchor_generator,
        box2box_transform,
        anchor_matcher,
        num_classes,
        focal_loss_alpha=0.25,
        focal_loss_gamma=2.0,
        smooth_l1_beta=0.0,
        box_reg_loss_type="smooth_l1",
        test_score_thresh=0.05,
        test_topk_candidates=1000,
        test_nms_thresh=0.5,
        max_detections_per_image=100,
        pixel_mean,
        pixel_std,
        vis_period=0,
        input_format="BGR",
    ):
        """
        NOTE: this interface is experimental.

        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            head (nn.Module): a module that predicts logits and regression deltas
                for each level from a list of per-level features
            head_in_features (Tuple[str]): Names of the input feature maps to be used in head
            anchor_generator (nn.Module): a module that creates anchors from a
                list of features. Usually an instance of :class:`AnchorGenerator`
            box2box_transform (Box2BoxTransform): defines the transform from anchor boxes to
                instance boxes
            anchor_matcher (Matcher): label the anchors by matching them with ground truth.
            num_classes (int): number of classes. Used to label background proposals.

            # Loss parameters:
            focal_loss_alpha (float): focal_loss_alpha
            focal_loss_gamma (float): focal_loss_gamma
            smooth_l1_beta (float): smooth_l1_beta
            box_reg_loss_type (str): Options are "smooth_l1", "giou", "diou", "ciou"

            # Inference parameters:
            test_score_thresh (float): Inference cls score threshold, only anchors with
                score > INFERENCE_TH are considered for inference (to improve speed)
            test_topk_candidates (int): Select topk candidates before NMS
            test_nms_thresh (float): Overlap threshold used for non-maximum suppression
                (suppress boxes with IoU >= this threshold)
            max_detections_per_image (int):
                Maximum number of detections to return per image during inference
                (100 is based on the limit established for the COCO dataset).

            pixel_mean, pixel_std: see :class:`DenseDetector`.
        """
        super().__init__(
            backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
        )
        self.num_classes = num_classes

        # Anchors
        self.anchor_generator = anchor_generator
        self.box2box_transform = box2box_transform
        self.anchor_matcher = anchor_matcher

        # Loss parameters:
        self.focal_loss_alpha = focal_loss_alpha
        self.focal_loss_gamma = focal_loss_gamma
        self.smooth_l1_beta = smooth_l1_beta
        self.box_reg_loss_type = box_reg_loss_type
        # Inference parameters:
        self.test_score_thresh = test_score_thresh
        self.test_topk_candidates = test_topk_candidates
        self.test_nms_thresh = test_nms_thresh
        self.max_detections_per_image = max_detections_per_image
        # Vis parameters
        self.vis_period = vis_period
        self.input_format = input_format

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        backbone_shape = backbone.output_shape()
        feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES]
        head = RetinaNetHead(cfg, feature_shapes)
        anchor_generator = build_anchor_generator(cfg, feature_shapes)
        return {
            "backbone": backbone,
            "head": head,
            "anchor_generator": anchor_generator,
            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS),
            "anchor_matcher": Matcher(
                cfg.MODEL.RETINANET.IOU_THRESHOLDS,
                cfg.MODEL.RETINANET.IOU_LABELS,
                allow_low_quality_matches=True,
            ),
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
            "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
            "head_in_features": cfg.MODEL.RETINANET.IN_FEATURES,
            # Loss parameters:
            "focal_loss_alpha": cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA,
            "focal_loss_gamma": cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA,
            "smooth_l1_beta": cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA,
            "box_reg_loss_type": cfg.MODEL.RETINANET.BBOX_REG_LOSS_TYPE,
            # Inference parameters:
            "test_score_thresh": cfg.MODEL.RETINANET.SCORE_THRESH_TEST,
            "test_topk_candidates": cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST,
            "test_nms_thresh": cfg.MODEL.RETINANET.NMS_THRESH_TEST,
            "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
            # Vis parameters
            "vis_period": cfg.VIS_PERIOD,
            "input_format": cfg.INPUT.FORMAT,
        }

    def forward_training(self, images, features, predictions, gt_instances):
        # Transpose the Hi*Wi*A dimension to the middle:
        pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
            predictions, [self.num_classes, 4]
        )
        anchors = self.anchor_generator(features)
        gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
        return self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes)

    def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
        """
        Args:
            anchors (list[Boxes]): a list of #feature level Boxes
            gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
                Their shapes are (N, R) and (N, R, 4), respectively, where R is
                the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
            pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
                list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
                Where K is the number of classes used in `pred_logits`.

        Returns:
            dict[str, Tensor]:
                mapping from a named loss to a scalar tensor storing the loss.
                Used during training only. The dict keys are: "loss_cls" and "loss_box_reg"
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, R)

        valid_mask = gt_labels >= 0
        pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
        num_pos_anchors = pos_mask.sum().item()
        get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
        normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 100)

        # classification and regression loss
        gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
            :, :-1
        ]  # no loss for the last (background) class
        loss_cls = sigmoid_focal_loss_jit(
            cat(pred_logits, dim=1)[valid_mask],
            gt_labels_target.to(pred_logits[0].dtype),
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )

        loss_box_reg = _dense_box_regression_loss(
            anchors,
            self.box2box_transform,
            pred_anchor_deltas,
            gt_boxes,
            pos_mask,
            box_reg_loss_type=self.box_reg_loss_type,
            smooth_l1_beta=self.smooth_l1_beta,
        )

        return {
            "loss_cls": loss_cls / normalizer,
            "loss_box_reg": loss_box_reg / normalizer,
        }

    @torch.no_grad()
    def label_anchors(self, anchors, gt_instances):
        """
        Args:
            anchors (list[Boxes]): A list of #feature level Boxes.
                The Boxes contains anchors of this image on the specific feature level.
            gt_instances (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.

        Returns:
            list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is
                the total number of anchors across all feature maps (sum(Hi * Wi * A)).
                Label values are in {-1, 0, ..., K}, with -1 means ignore, and K means background.

            list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of anchors
                across feature maps. The values are the matched gt boxes for each anchor.
                Values are undefined for those anchors not labeled as foreground.
        """
        anchors = Boxes.cat(anchors)  # Rx4

        gt_labels = []
        matched_gt_boxes = []
        for gt_per_image in gt_instances:
            match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
            matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
            del match_quality_matrix

            if len(gt_per_image) > 0:
                matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]

                gt_labels_i = gt_per_image.gt_classes[matched_idxs]
                # Anchors with label 0 are treated as background.
                gt_labels_i[anchor_labels == 0] = self.num_classes
                # Anchors with label -1 are ignored.
                gt_labels_i[anchor_labels == -1] = -1
            else:
                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
                gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes

            gt_labels.append(gt_labels_i)
            matched_gt_boxes.append(matched_gt_boxes_i)

        return gt_labels, matched_gt_boxes

    def forward_inference(
        self, images: ImageList, features: List[Tensor], predictions: List[List[Tensor]]
    ):
        pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
            predictions, [self.num_classes, 4]
        )
        anchors = self.anchor_generator(features)

        results: List[Instances] = []
        for img_idx, image_size in enumerate(images.image_sizes):
            scores_per_image = [x[img_idx].sigmoid_() for x in pred_logits]
            deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
            results_per_image = self.inference_single_image(
                anchors, scores_per_image, deltas_per_image, image_size
            )
            results.append(results_per_image)
        return results

    def inference_single_image(
        self,
        anchors: List[Boxes],
        box_cls: List[Tensor],
        box_delta: List[Tensor],
        image_size: Tuple[int, int],
    ):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Arguments:
            anchors (list[Boxes]): list of #feature levels. Each entry contains
                a Boxes object, which contains all the anchors in that feature level.
            box_cls (list[Tensor]): list of #feature levels. Each entry contains
                tensor of size (H x W x A, K)
            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        pred = self._decode_multi_level_predictions(
            anchors,
            box_cls,
            box_delta,
            self.test_score_thresh,
            self.test_topk_candidates,
            image_size,
        )
        keep = batched_nms(  # per-class NMS
            pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
        )
        return pred[keep[: self.max_detections_per_image]]


class RetinaNetHead(nn.Module):
    """
    The head used in RetinaNet for object classification and box regression.
    It has two subnets for the two tasks, with a common structure but separate parameters.
    """

    @configurable
    def __init__(
        self,
        *,
        input_shape: List[ShapeSpec],
        num_classes,
        num_anchors,
        conv_dims: List[int],
        norm="",
        prior_prob=0.01,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape (List[ShapeSpec]): input shape
            num_classes (int): number of classes. Used to label background proposals.
            num_anchors (int): number of generated anchors
            conv_dims (List[int]): dimensions for each convolution layer
            norm (str or callable):
                Normalization for conv layers except for the two output layers.
                See :func:`detectron2.layers.get_norm` for supported types.
            prior_prob (float): Prior weight for computing bias
        """
        super().__init__()

        self._num_features = len(input_shape)
        if norm == "BN" or norm == "SyncBN":
            logger.info(
                f"Using domain-specific {norm} in RetinaNetHead with len={self._num_features}."
            )
            bn_class = nn.BatchNorm2d if norm == "BN" else nn.SyncBatchNorm

            def norm(c):
                return CycleBatchNormList(
                    length=self._num_features, bn_class=bn_class, num_features=c
                )

        else:
            norm_name = str(type(get_norm(norm, 32)))
            if "BN" in norm_name:
                logger.warning(
                    f"Shared BatchNorm (type={norm_name}) may not work well in RetinaNetHead."
                )

        cls_subnet = []
        bbox_subnet = []
        for in_channels, out_channels in zip(
            [input_shape[0].channels] + list(conv_dims), conv_dims
        ):
            cls_subnet.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            if norm:
                cls_subnet.append(get_norm(norm, out_channels))
            cls_subnet.append(nn.ReLU())
            bbox_subnet.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            if norm:
                bbox_subnet.append(get_norm(norm, out_channels))
            bbox_subnet.append(nn.ReLU())

        self.cls_subnet = nn.Sequential(*cls_subnet)
        self.bbox_subnet = nn.Sequential(*bbox_subnet)
        self.cls_score = nn.Conv2d(
            conv_dims[-1], num_anchors * num_classes, kernel_size=3, stride=1, padding=1
        )
        self.bbox_pred = nn.Conv2d(
            conv_dims[-1], num_anchors * 4, kernel_size=3, stride=1, padding=1
        )

        # Initialization
        for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
            for layer in modules.modules():
                if isinstance(layer, nn.Conv2d):
                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                    torch.nn.init.constant_(layer.bias, 0)

        # Use prior in model initialization to improve stability
        bias_value = -(math.log((1 - prior_prob) / prior_prob))
        torch.nn.init.constant_(self.cls_score.bias, bias_value)

    @classmethod
    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
        num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
        assert (
            len(set(num_anchors)) == 1
        ), "Using different number of anchors between levels is not currently supported!"
        num_anchors = num_anchors[0]

        return {
            "input_shape": input_shape,
            "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
            "conv_dims": [input_shape[0].channels] * cfg.MODEL.RETINANET.NUM_CONVS,
            "prior_prob": cfg.MODEL.RETINANET.PRIOR_PROB,
            "norm": cfg.MODEL.RETINANET.NORM,
            "num_anchors": num_anchors,
        }

    def forward(self, features: List[Tensor]):
        """
        Arguments:
            features (list[Tensor]): FPN feature map tensors in high to low resolution.
                Each tensor in the list corresponds to a different feature level.

        Returns:
            logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
                The tensor predicts the classification probability
                at each spatial position for each of the A anchors and K object
                classes.
            bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
                The tensor predicts 4-vector (dx,dy,dw,dh) box
                regression values for every anchor. These values are the
                relative offset between the anchor and the ground truth box.
        """
        assert len(features) == self._num_features
        logits = []
        bbox_reg = []
        for feature in features:
            logits.append(self.cls_score(self.cls_subnet(feature)))
            bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
        return logits, bbox_reg
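Not part of retinanet.py above: a small, self-contained sketch of the classification-target construction in `RetinaNet.losses`, using made-up labels, to show why the one-hot target drops the last (background) column before `sigmoid_focal_loss_jit`.

import torch
import torch.nn.functional as F

# Toy numbers, not from the file above: 3 classes (K=3), background label == K == 3,
# ignored anchors labeled -1.
num_classes = 3
gt_labels = torch.tensor([[0, 2, 3, -1, 1]])  # (N=1 image, R=5 anchors)

valid_mask = gt_labels >= 0                                # drop ignored anchors
pos_mask = (gt_labels >= 0) & (gt_labels != num_classes)   # foreground anchors only

# One-hot over K+1 classes, then drop the last (background) column so that
# background anchors get an all-zero target under the per-class sigmoid loss.
target = F.one_hot(gt_labels[valid_mask], num_classes=num_classes + 1)[:, :-1]
print(target)
# tensor([[1, 0, 0],
#         [0, 0, 1],
#         [0, 0, 0],
#         [0, 1, 0]])
print(pos_mask.sum().item())  # 3 positive anchors -> feeds the EMA loss normalizer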
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/semantic_seg.py
ADDED
@@ -0,0 +1,267 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
from typing import Callable, Dict, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
from annotator.oneformer.detectron2.structures import ImageList
from annotator.oneformer.detectron2.utils.registry import Registry

from ..backbone import Backbone, build_backbone
from ..postprocessing import sem_seg_postprocess
from .build import META_ARCH_REGISTRY

__all__ = [
    "SemanticSegmentor",
    "SEM_SEG_HEADS_REGISTRY",
    "SemSegFPNHead",
    "build_sem_seg_head",
]


SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
SEM_SEG_HEADS_REGISTRY.__doc__ = """
Registry for semantic segmentation heads, which make semantic segmentation predictions
from feature maps.
"""


@META_ARCH_REGISTRY.register()
class SemanticSegmentor(nn.Module):
    """
    Main class for semantic segmentation architectures.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__()
        self.backbone = backbone
        self.sem_seg_head = sem_seg_head
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
        return {
            "backbone": backbone,
            "sem_seg_head": sem_seg_head,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        return self.pixel_mean.device

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                * "image": Tensor, image in (C, H, W) format.
                * "sem_seg": semantic segmentation ground truth
                * Other information that's included in the original dicts, such as:
                  "height", "width" (int): the output resolution of the model (may be different
                  from input resolution), used in inference.

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "sem_seg" whose value is a
                Tensor that represents the
                per-pixel segmentation predicted by the head.
                The prediction has shape KxHxW that represents the logits of
                each class for each pixel.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )

        features = self.backbone(images.tensor)

        if "sem_seg" in batched_inputs[0]:
            targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
            targets = ImageList.from_tensors(
                targets,
                self.backbone.size_divisibility,
                self.sem_seg_head.ignore_value,
                self.backbone.padding_constraints,
            ).tensor
        else:
            targets = None
        results, losses = self.sem_seg_head(features, targets)

        if self.training:
            return losses

        processed_results = []
        for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = sem_seg_postprocess(result, image_size, height, width)
            processed_results.append({"sem_seg": r})
        return processed_results


def build_sem_seg_head(cfg, input_shape):
    """
    Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.
    """
    name = cfg.MODEL.SEM_SEG_HEAD.NAME
    return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)


@SEM_SEG_HEADS_REGISTRY.register()
class SemSegFPNHead(nn.Module):
    """
    A semantic segmentation head described in :paper:`PanopticFPN`.
    It takes a list of FPN features as input, and applies a sequence of
    3x3 convs and upsampling to scale all of them to the stride defined by
    ``common_stride``. Then these features are added and used to make final
    predictions by another 1x1 conv layer.
    """

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        num_classes: int,
        conv_dims: int,
        common_stride: int,
        loss_weight: float = 1.0,
        norm: Optional[Union[str, Callable]] = None,
        ignore_value: int = -1,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape: shapes (channels and stride) of the input features
            num_classes: number of classes to predict
            conv_dims: number of output channels for the intermediate conv layers.
            common_stride: the common stride that all features will be upscaled to
            loss_weight: loss weight
            norm (str or callable): normalization for all conv layers
            ignore_value: category id to be ignored during training.
        """
        super().__init__()
        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
        if not len(input_shape):
            raise ValueError("SemSegFPNHead(input_shape=) cannot be empty!")
        self.in_features = [k for k, v in input_shape]
        feature_strides = [v.stride for k, v in input_shape]
        feature_channels = [v.channels for k, v in input_shape]

        self.ignore_value = ignore_value
        self.common_stride = common_stride
        self.loss_weight = loss_weight

        self.scale_heads = []
        for in_feature, stride, channels in zip(
            self.in_features, feature_strides, feature_channels
        ):
            head_ops = []
            head_length = max(1, int(np.log2(stride) - np.log2(self.common_stride)))
            for k in range(head_length):
                norm_module = get_norm(norm, conv_dims)
                conv = Conv2d(
                    channels if k == 0 else conv_dims,
                    conv_dims,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=not norm,
                    norm=norm_module,
                    activation=F.relu,
                )
                weight_init.c2_msra_fill(conv)
                head_ops.append(conv)
                if stride != self.common_stride:
                    head_ops.append(
                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
                    )
            self.scale_heads.append(nn.Sequential(*head_ops))
            self.add_module(in_feature, self.scale_heads[-1])
        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
        weight_init.c2_msra_fill(self.predictor)

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        return {
            "input_shape": {
                k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
            },
            "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
            "conv_dims": cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM,
            "common_stride": cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE,
            "norm": cfg.MODEL.SEM_SEG_HEAD.NORM,
            "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
        }

    def forward(self, features, targets=None):
        """
        Returns:
            In training, returns (None, dict of losses)
            In inference, returns (CxHxW logits, {})
        """
        x = self.layers(features)
        if self.training:
            return None, self.losses(x, targets)
        else:
            x = F.interpolate(
                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
            )
            return x, {}

    def layers(self, features):
        for i, f in enumerate(self.in_features):
            if i == 0:
                x = self.scale_heads[i](features[f])
            else:
                x = x + self.scale_heads[i](features[f])
        x = self.predictor(x)
        return x

    def losses(self, predictions, targets):
        predictions = predictions.float()  # https://github.com/pytorch/pytorch/issues/48163
        predictions = F.interpolate(
            predictions,
            scale_factor=self.common_stride,
            mode="bilinear",
            align_corners=False,
        )
        loss = F.cross_entropy(
            predictions, targets, reduction="mean", ignore_index=self.ignore_value
        )
        losses = {"loss_sem_seg": loss * self.loss_weight}
        return losses
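Not part of semantic_seg.py above: a quick worked example of the `head_length` computation in `SemSegFPNHead.__init__`. The stride values below are typical FPN strides (p2..p5) and `common_stride=4` is assumed here, not read from any config in this diff.

import numpy as np

common_stride = 4
for name, stride in [("p2", 4), ("p3", 8), ("p4", 16), ("p5", 32)]:
    head_length = max(1, int(np.log2(stride) - np.log2(common_stride)))
    # one 2x upsample is appended per conv block whenever stride != common_stride
    ups = 0 if stride == common_stride else head_length
    print(f"{name}: {head_length} conv block(s), {ups} 2x upsample(s)")
# p2: 1 conv block(s), 0 2x upsample(s)
# p3: 1 conv block(s), 1 2x upsample(s)
# p4: 2 conv block(s), 2 2x upsample(s)
# p5: 3 conv block(s), 3 2x upsample(s)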
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/mmdet_wrapper.py
ADDED
@@ -0,0 +1,273 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import logging
import numpy as np
from collections import OrderedDict
from collections.abc import Mapping
from typing import Dict, List, Optional, Tuple, Union
import torch
from omegaconf import DictConfig, OmegaConf
from torch import Tensor, nn

from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.structures import BitMasks, Boxes, ImageList, Instances
from annotator.oneformer.detectron2.utils.events import get_event_storage

from .backbone import Backbone

logger = logging.getLogger(__name__)


def _to_container(cfg):
    """
    mmdet will assert the type of dict/list.
    So convert omegaconf objects to dict/list.
    """
    if isinstance(cfg, DictConfig):
        cfg = OmegaConf.to_container(cfg, resolve=True)
    from mmcv.utils import ConfigDict

    return ConfigDict(cfg)


class MMDetBackbone(Backbone):
    """
    Wrapper of mmdetection backbones to use in detectron2.

    mmdet backbones produce list/tuple of tensors, while detectron2 backbones
    produce a dict of tensors. This class wraps the given backbone to produce
    output in detectron2's convention, so it can be used in place of detectron2
    backbones.
    """

    def __init__(
        self,
        backbone: Union[nn.Module, Mapping],
        neck: Union[nn.Module, Mapping, None] = None,
        *,
        output_shapes: List[ShapeSpec],
        output_names: Optional[List[str]] = None,
    ):
        """
        Args:
            backbone: either a backbone module or a mmdet config dict that defines a
                backbone. The backbone takes a 4D image tensor and returns a
                sequence of tensors.
            neck: either a backbone module or a mmdet config dict that defines a
                neck. The neck takes outputs of backbone and returns a
                sequence of tensors. If None, no neck is used.
            output_shapes: shape for every output of the backbone (or neck, if given).
                stride and channels are often needed.
            output_names: names for every output of the backbone (or neck, if given).
                By default, will use "out0", "out1", ...
        """
        super().__init__()
        if isinstance(backbone, Mapping):
            from mmdet.models import build_backbone

            backbone = build_backbone(_to_container(backbone))
        self.backbone = backbone

        if isinstance(neck, Mapping):
            from mmdet.models import build_neck

            neck = build_neck(_to_container(neck))
        self.neck = neck

        # "Neck" weights, if any, are part of neck itself. This is the interface
        # of mmdet so we follow it. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
        logger.info("Initializing mmdet backbone weights...")
        self.backbone.init_weights()
        # train() in mmdet modules is non-trivial, and has to be explicitly
        # called. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
        self.backbone.train()
        if self.neck is not None:
            logger.info("Initializing mmdet neck weights ...")
            if isinstance(self.neck, nn.Sequential):
                for m in self.neck:
                    m.init_weights()
            else:
                self.neck.init_weights()
            self.neck.train()

        self._output_shapes = output_shapes
        if not output_names:
            output_names = [f"out{i}" for i in range(len(output_shapes))]
        self._output_names = output_names

    def forward(self, x) -> Dict[str, Tensor]:
        outs = self.backbone(x)
        if self.neck is not None:
            outs = self.neck(outs)
        assert isinstance(
            outs, (list, tuple)
        ), "mmdet backbone should return a list/tuple of tensors!"
        if len(outs) != len(self._output_shapes):
            raise ValueError(
                "Length of output_shapes does not match outputs from the mmdet backbone: "
                f"{len(outs)} != {len(self._output_shapes)}"
            )
        return {k: v for k, v in zip(self._output_names, outs)}

    def output_shape(self) -> Dict[str, ShapeSpec]:
        return {k: v for k, v in zip(self._output_names, self._output_shapes)}


class MMDetDetector(nn.Module):
    """
    Wrapper of a mmdetection detector model, for detection and instance segmentation.
    Input/output formats of this class follow detectron2's convention, so a
    mmdetection model can be trained and evaluated in detectron2.
    """

    def __init__(
        self,
        detector: Union[nn.Module, Mapping],
        *,
        # Default is 32 regardless of model:
        # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
        size_divisibility=32,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            detector: a mmdet detector, or a mmdet config dict that defines a detector.
            size_divisibility: pad input images to multiple of this number
            pixel_mean: per-channel mean to normalize input image
            pixel_std: per-channel stddev to normalize input image
        """
        super().__init__()
        if isinstance(detector, Mapping):
            from mmdet.models import build_detector

            detector = build_detector(_to_container(detector))
        self.detector = detector
        self.detector.init_weights()
        self.size_divisibility = size_divisibility

        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor
        metas = []
        rescale = {"height" in x for x in batched_inputs}
        if len(rescale) != 1:
            raise ValueError("Some inputs have original height/width, but some don't!")
        rescale = list(rescale)[0]
        output_shapes = []
        for input in batched_inputs:
            meta = {}
            c, h, w = input["image"].shape
            meta["img_shape"] = meta["ori_shape"] = (h, w, c)
            if rescale:
                scale_factor = np.array(
                    [w / input["width"], h / input["height"]] * 2, dtype="float32"
                )
                ori_shape = (input["height"], input["width"])
                output_shapes.append(ori_shape)
                meta["ori_shape"] = ori_shape + (c,)
            else:
                scale_factor = 1.0
                output_shapes.append((h, w))
            meta["scale_factor"] = scale_factor
            meta["flip"] = False
            padh, padw = images.shape[-2:]
            meta["pad_shape"] = (padh, padw, c)
            metas.append(meta)

        if self.training:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
            if gt_instances[0].has("gt_masks"):
                from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks

                def convert_mask(m, shape):
                    # mmdet mask format
                    if isinstance(m, BitMasks):
                        return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
                    else:
                        return mm_PolygonMasks(m.polygons, shape[0], shape[1])

                gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances]
                losses_and_metrics = self.detector.forward_train(
                    images,
                    metas,
                    [x.gt_boxes.tensor for x in gt_instances],
                    [x.gt_classes for x in gt_instances],
                    gt_masks=gt_masks,
                )
            else:
                losses_and_metrics = self.detector.forward_train(
                    images,
                    metas,
                    [x.gt_boxes.tensor for x in gt_instances],
                    [x.gt_classes for x in gt_instances],
                )
            return _parse_losses(losses_and_metrics)
        else:
            results = self.detector.simple_test(images, metas, rescale=rescale)
            results = [
                {"instances": _convert_mmdet_result(r, shape)}
                for r, shape in zip(results, output_shapes)
            ]
            return results

    @property
    def device(self):
        return self.pixel_mean.device


# Reference: show_result() in
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
    if isinstance(result, tuple):
        bbox_result, segm_result = result
        if isinstance(segm_result, tuple):
            segm_result = segm_result[0]
    else:
        bbox_result, segm_result = result, None

    bboxes = torch.from_numpy(np.vstack(bbox_result))  # Nx5
    bboxes, scores = bboxes[:, :4], bboxes[:, -1]
    labels = [
        torch.full((bbox.shape[0],), i, dtype=torch.int32) for i, bbox in enumerate(bbox_result)
    ]
    labels = torch.cat(labels)
    inst = Instances(shape)
    inst.pred_boxes = Boxes(bboxes)
    inst.scores = scores
    inst.pred_classes = labels

    if segm_result is not None and len(labels) > 0:
        segm_result = list(itertools.chain(*segm_result))
        segm_result = [torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in segm_result]
        segm_result = torch.stack(segm_result, dim=0)
        inst.pred_masks = segm_result
    return inst


# reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
        else:
            raise TypeError(f"{loss_name} is not a tensor or list of tensors")

        if "loss" not in loss_name:
            # put metrics to storage; don't return them
            storage = get_event_storage()
            value = log_vars.pop(loss_name).cpu().item()
            storage.put_scalar(loss_name, value)
    return log_vars
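Not part of mmdet_wrapper.py above: a toy illustration of the reduction that `_parse_losses` applies to an mmdet-style loss dict. The values are made up, and only keys containing "loss" are used, so no EventStorage context is needed for this sketch.

import torch
from collections import OrderedDict

# Made-up mmdet-style output: a per-image tensor and a per-level list, keyed by name.
losses_and_metrics = OrderedDict(
    loss_cls=torch.tensor([0.8, 0.6]),                 # tensors are reduced by .mean()
    loss_bbox=[torch.tensor(0.3), torch.tensor(0.1)],  # list entries are mean-ed, then summed
)

log_vars = OrderedDict()
for name, value in losses_and_metrics.items():
    if isinstance(value, torch.Tensor):
        log_vars[name] = value.mean()
    elif isinstance(value, list):
        log_vars[name] = sum(v.mean() for v in value)

print({k: round(v.item(), 2) for k, v in log_vars.items()})
# {'loss_cls': 0.7, 'loss_bbox': 0.4}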
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/poolers.py
ADDED
@@ -0,0 +1,263 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import math
from typing import List, Optional
import torch
from torch import nn
from torchvision.ops import RoIPool

from annotator.oneformer.detectron2.layers import ROIAlign, ROIAlignRotated, cat, nonzero_tuple, shapes_to_tensor
from annotator.oneformer.detectron2.structures import Boxes
from annotator.oneformer.detectron2.utils.tracing import assert_fx_safe, is_fx_tracing

"""
To export ROIPooler to torchscript, in this file, variables that should be annotated with
`Union[List[Boxes], List[RotatedBoxes]]` are only annotated with `List[Boxes]`.

TODO: Correct these annotations when torchscript support `Union`.
https://github.com/pytorch/pytorch/issues/41412
"""

__all__ = ["ROIPooler"]


def assign_boxes_to_levels(
    box_lists: List[Boxes],
    min_level: int,
    max_level: int,
    canonical_box_size: int,
    canonical_level: int,
):
    """
    Map each box in `box_lists` to a feature map level index and return the assignment
    vector.

    Args:
        box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
            where N is the number of images in the batch.
        min_level (int): Smallest feature map level index. The input is considered index 0,
            the output of stage 1 is index 1, and so on.
        max_level (int): Largest feature map level index.
        canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
        canonical_level (int): The feature map level index on which a canonically-sized box
            should be placed.

    Returns:
        A tensor of length M, where M is the total number of boxes aggregated over all
            N batch images. The memory layout corresponds to the concatenation of boxes
            from all images. Each element is the feature map index, as an offset from
            `self.min_level`, for the corresponding box (so value i means the box is at
            `self.min_level + i`).
    """
    box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
    # Eqn.(1) in FPN paper
    level_assignments = torch.floor(
        canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)
    )
    # clamp level to (min, max), in case the box size is too large or too small
    # for the available feature maps
    level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
    return level_assignments.to(torch.int64) - min_level
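(Aside, not part of poolers.py: a worked example of the Eqn.(1) mapping used by `assign_boxes_to_levels` above, with the default `canonical_box_size=224`, `canonical_level=4`, and levels clamped to an assumed [2, 5] pyramid; the function itself additionally subtracts `min_level` to index into the pooler list.)

import math

def level_for_box(area, canonical_box_size=224, canonical_level=4, min_level=2, max_level=5):
    size = math.sqrt(area)
    level = math.floor(canonical_level + math.log2(size / canonical_box_size + 1e-8))
    return min(max(level, min_level), max_level)

print(level_for_box(224 * 224))  # 4 -> a canonical-size box stays on the canonical level
print(level_for_box(448 * 448))  # 5 -> 4x the area moves one level up
print(level_for_box(32 * 32))    # 2 -> small boxes are clamped to the finest level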
# script the module to avoid hardcoded device type
@torch.jit.script_if_tracing
def _convert_boxes_to_pooler_format(boxes: torch.Tensor, sizes: torch.Tensor) -> torch.Tensor:
    sizes = sizes.to(device=boxes.device)
    indices = torch.repeat_interleave(
        torch.arange(len(sizes), dtype=boxes.dtype, device=boxes.device), sizes
    )
    return cat([indices[:, None], boxes], dim=1)


def convert_boxes_to_pooler_format(box_lists: List[Boxes]):
    """
    Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
    (see description under Returns).

    Args:
        box_lists (list[Boxes] | list[RotatedBoxes]):
            A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.

    Returns:
        When input is list[Boxes]:
            A tensor of shape (M, 5), where M is the total number of boxes aggregated over all
            N batch images.
            The 5 columns are (batch index, x0, y0, x1, y1), where batch index
            is the index in [0, N) identifying which batch image the box with corners at
            (x0, y0, x1, y1) comes from.
        When input is list[RotatedBoxes]:
            A tensor of shape (M, 6), where M is the total number of boxes aggregated over all
            N batch images.
            The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees),
            where batch index is the index in [0, N) identifying which batch image the
            rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from.
    """
    boxes = torch.cat([x.tensor for x in box_lists], dim=0)
    # __len__ returns Tensor in tracing.
    sizes = shapes_to_tensor([x.__len__() for x in box_lists])
    return _convert_boxes_to_pooler_format(boxes, sizes)


@torch.jit.script_if_tracing
def _create_zeros(
    batch_target: Optional[torch.Tensor],
    channels: int,
    height: int,
    width: int,
    like_tensor: torch.Tensor,
) -> torch.Tensor:
    batches = batch_target.shape[0] if batch_target is not None else 0
    sizes = (batches, channels, height, width)
    return torch.zeros(sizes, dtype=like_tensor.dtype, device=like_tensor.device)


class ROIPooler(nn.Module):
    """
    Region of interest feature map pooler that supports pooling from one or more
    feature maps.
    """

    def __init__(
        self,
        output_size,
        scales,
        sampling_ratio,
        pooler_type,
        canonical_box_size=224,
        canonical_level=4,
    ):
        """
        Args:
            output_size (int, tuple[int] or list[int]): output size of the pooled region,
                e.g., 14 x 14. If tuple or list is given, the length must be 2.
            scales (list[float]): The scale for each low-level pooling op relative to
                the input image. For a feature map with stride s relative to the input
                image, scale is defined as 1/s. The stride must be power of 2.
                When there are multiple scales, they must form a pyramid, i.e. they must be
                a monotonically decreasing geometric sequence with a factor of 1/2.
            sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op.
            pooler_type (string): Name of the type of pooling operation that should be applied.
                For instance, "ROIPool" or "ROIAlignV2".
            canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default
                is heuristically defined as 224 pixels in the FPN paper (based on ImageNet
                pre-training).
            canonical_level (int): The feature map level index from which a canonically-sized box
                should be placed. The default is defined as level 4 (stride=16) in the FPN paper,
                i.e., a box of size 224x224 will be placed on the feature with stride=16.
                The box placement for all boxes will be determined from their sizes w.r.t
                canonical_box_size. For example, a box whose area is 4x that of a canonical box
                should be used to pool features from feature level ``canonical_level+1``.

                Note that the actual input feature maps given to this module may not have
                sufficiently many levels for the input boxes. If the boxes are too large or too
                small for the input feature maps, the closest level will be used.
        """
        super().__init__()

        if isinstance(output_size, int):
            output_size = (output_size, output_size)
        assert len(output_size) == 2
        assert isinstance(output_size[0], int) and isinstance(output_size[1], int)
        self.output_size = output_size

        if pooler_type == "ROIAlign":
            self.level_poolers = nn.ModuleList(
                ROIAlign(
                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False
                )
                for scale in scales
            )
        elif pooler_type == "ROIAlignV2":
            self.level_poolers = nn.ModuleList(
                ROIAlign(
                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True
                )
                for scale in scales
            )
        elif pooler_type == "ROIPool":
            self.level_poolers = nn.ModuleList(
                RoIPool(output_size, spatial_scale=scale) for scale in scales
            )
        elif pooler_type == "ROIAlignRotated":
            self.level_poolers = nn.ModuleList(
                ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio)
                for scale in scales
            )
        else:
            raise ValueError("Unknown pooler type: {}".format(pooler_type))

        # Map scale (defined as 1 / stride) to its feature map level under the
        # assumption that stride is a power of 2.
        min_level = -(math.log2(scales[0]))
        max_level = -(math.log2(scales[-1]))
        assert math.isclose(min_level, int(min_level)) and math.isclose(
            max_level, int(max_level)
        ), "Featuremap stride is not power of 2!"
        self.min_level = int(min_level)
        self.max_level = int(max_level)
        assert (
            len(scales) == self.max_level - self.min_level + 1
        ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!"
        assert 0 <= self.min_level and self.min_level <= self.max_level
        self.canonical_level = canonical_level
        assert canonical_box_size > 0
        self.canonical_box_size = canonical_box_size

    def forward(self, x: List[torch.Tensor], box_lists: List[Boxes]):
        """
        Args:
            x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
                used to construct this module.
            box_lists (list[Boxes] | list[RotatedBoxes]):
                A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
                The box coordinates are defined on the original image and
                will be scaled by the `scales` argument of :class:`ROIPooler`.

        Returns:
            Tensor:
                A tensor of shape (M, C, output_size, output_size) where M is the total number of
                boxes aggregated over all N batch images and C is the number of channels in `x`.
        """
        num_level_assignments = len(self.level_poolers)

        if not is_fx_tracing():
            torch._assert(
                isinstance(x, list) and isinstance(box_lists, list),
                "Arguments to pooler must be lists",
            )
        assert_fx_safe(
            len(x) == num_level_assignments,
            "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
                num_level_assignments, len(x)
|
232 |
+
),
|
233 |
+
)
|
234 |
+
assert_fx_safe(
|
235 |
+
len(box_lists) == x[0].size(0),
|
236 |
+
"unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
|
237 |
+
x[0].size(0), len(box_lists)
|
238 |
+
),
|
239 |
+
)
|
240 |
+
if len(box_lists) == 0:
|
241 |
+
return _create_zeros(None, x[0].shape[1], *self.output_size, x[0])
|
242 |
+
|
243 |
+
pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)
|
244 |
+
|
245 |
+
if num_level_assignments == 1:
|
246 |
+
return self.level_poolers[0](x[0], pooler_fmt_boxes)
|
247 |
+
|
248 |
+
level_assignments = assign_boxes_to_levels(
|
249 |
+
box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level
|
250 |
+
)
|
251 |
+
|
252 |
+
num_channels = x[0].shape[1]
|
253 |
+
output_size = self.output_size[0]
|
254 |
+
|
255 |
+
output = _create_zeros(pooler_fmt_boxes, num_channels, output_size, output_size, x[0])
|
256 |
+
|
257 |
+
for level, pooler in enumerate(self.level_poolers):
|
258 |
+
inds = nonzero_tuple(level_assignments == level)[0]
|
259 |
+
pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
|
260 |
+
# Use index_put_ instead of advance indexing, to avoid pytorch/issues/49852
|
261 |
+
output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level))
|
262 |
+
|
263 |
+
return output
|
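As a quick orientation, the sketch below shows how the multi-level `ROIPooler` above might be instantiated and called. It assumes this file is the standard `modeling/poolers.py` module of the vendored detectron2 copy; the feature shapes, scales, and box coordinates are made-up illustrative values, not part of the diff.

```python
# Illustrative sketch: pooling two boxes from a 4-level feature pyramid.
import torch
from annotator.oneformer.detectron2.modeling.poolers import ROIPooler  # assumed module path
from annotator.oneformer.detectron2.structures import Boxes

# Strides 4, 8, 16, 32 -> scales 1/4 ... 1/32 for an 800x800 input image.
features = [
    torch.randn(1, 256, 200, 200),
    torch.randn(1, 256, 100, 100),
    torch.randn(1, 256, 50, 50),
    torch.randn(1, 256, 25, 25),
]
pooler = ROIPooler(
    output_size=7,
    scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
    sampling_ratio=0,
    pooler_type="ROIAlignV2",
)
# One image in the batch, two boxes in (x0, y0, x1, y1) image coordinates.
boxes = [Boxes(torch.tensor([[10.0, 10.0, 200.0, 180.0], [50.0, 60.0, 120.0, 300.0]]))]
out = pooler(features, boxes)  # tensor of shape (2, 256, 7, 7)
```

Each box is routed to one pyramid level based on its size relative to `canonical_box_size`, pooled there, and written back into the aggregated output with `index_put_`.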
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/postprocessing.py
ADDED
@@ -0,0 +1,100 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import torch
from torch.nn import functional as F

from annotator.oneformer.detectron2.structures import Instances, ROIMasks


# perhaps should rename to "resize_instance"
def detector_postprocess(
    results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5
):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function will resize the raw outputs of an R-CNN detector
    to produce outputs according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.
    Returns:
        Instances: the resized output from the model, based on the output resolution
    """
    if isinstance(output_width, torch.Tensor):
        # This shape might (but not necessarily) be tensors during tracing.
        # Converts integer tensors to float temporaries to ensure true
        # division is performed when computing scale_x and scale_y.
        output_width_tmp = output_width.float()
        output_height_tmp = output_height.float()
        new_size = torch.stack([output_height, output_width])
    else:
        new_size = (output_height, output_width)
        output_width_tmp = output_width
        output_height_tmp = output_height

    scale_x, scale_y = (
        output_width_tmp / results.image_size[1],
        output_height_tmp / results.image_size[0],
    )
    results = Instances(new_size, **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        output_boxes = None
    assert output_boxes is not None, "Predictions must contain boxes!"

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    results = results[output_boxes.nonempty()]

    if results.has("pred_masks"):
        if isinstance(results.pred_masks, ROIMasks):
            roi_masks = results.pred_masks
        else:
            # pred_masks is a tensor of shape (N, 1, M, M)
            roi_masks = ROIMasks(results.pred_masks[:, 0, :, :])
        results.pred_masks = roi_masks.to_bitmasks(
            results.pred_boxes, output_height, output_width, mask_threshold
        ).tensor  # TODO return ROIMasks/BitMask object in the future

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results


def sem_seg_postprocess(result, img_size, output_height, output_width):
    """
    Return semantic segmentation predictions in the original resolution.

    The input images are often resized when entering semantic segmentor. Moreover, in same
    cases, they also padded inside segmentor to be divisible by maximum network stride.
    As a result, we often need the predictions of the segmentor in a different
    resolution from its inputs.

    Args:
        result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        img_size (tuple): image size that segmentor is taking as input.
        output_height, output_width: the desired output resolution.

    Returns:
        semantic segmentation prediction (Tensor): A tensor of the shape
            (C, output_height, output_width) that contains per-pixel soft predictions.
    """
    result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
    result = F.interpolate(
        result, size=(output_height, output_width), mode="bilinear", align_corners=False
    )[0]
    return result
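For context, the sketch below shows what `sem_seg_postprocess` does at inference time: crop away the padding the segmentor added, then resize the logits back to the original image resolution. All sizes are made-up illustrative values, and the module path is assumed to be the vendored `modeling/postprocessing.py`; this is not part of the diff.

```python
# Illustrative sketch: rescaling a semantic-segmentation prediction.
import torch
from annotator.oneformer.detectron2.modeling.postprocessing import sem_seg_postprocess  # assumed path

logits = torch.randn(19, 512, 768)   # (C, H, W) logits at padded network resolution
img_size = (480, 720)                # valid region the segmentor actually saw (before padding)
orig_h, orig_w = 960, 1440           # resolution of the original image
full_res = sem_seg_postprocess(logits, img_size, orig_h, orig_w)
assert full_res.shape == (19, 960, 1440)
```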
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator
from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN, StandardRPNHead

__all__ = list(globals().keys())
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/build.py
ADDED
@@ -0,0 +1,24 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from annotator.oneformer.detectron2.utils.registry import Registry

PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR")
PROPOSAL_GENERATOR_REGISTRY.__doc__ = """
Registry for proposal generator, which produces object proposals from feature maps.

The registered object will be called with `obj(cfg, input_shape)`.
The call should return a `nn.Module` object.
"""

from . import rpn, rrpn  # noqa F401 isort:skip


def build_proposal_generator(cfg, input_shape):
    """
    Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`.
    The name can be "PrecomputedProposals" to use no proposal generator.
    """
    name = cfg.MODEL.PROPOSAL_GENERATOR.NAME
    if name == "PrecomputedProposals":
        return None

    return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape)
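A hypothetical sketch of how this registry is meant to be used: register a custom module under a name, then point `cfg.MODEL.PROPOSAL_GENERATOR.NAME` at that name so `build_proposal_generator` constructs it. The class below is invented for illustration only and is not part of the diff.

```python
# Illustrative sketch: registering a (hypothetical) custom proposal generator.
from torch import nn
from annotator.oneformer.detectron2.modeling.proposal_generator import PROPOSAL_GENERATOR_REGISTRY


@PROPOSAL_GENERATOR_REGISTRY.register()
class DummyProposalGenerator(nn.Module):
    def __init__(self, cfg, input_shape):
        super().__init__()

    def forward(self, images, features, gt_instances=None):
        # A real generator returns (list[Instances] proposals, dict of losses).
        return [], {}

# In a config: cfg.MODEL.PROPOSAL_GENERATOR.NAME = "DummyProposalGenerator"
```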
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/proposal_utils.py
ADDED
@@ -0,0 +1,205 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import math
from typing import List, Tuple, Union
import torch

from annotator.oneformer.detectron2.layers import batched_nms, cat, move_device_like
from annotator.oneformer.detectron2.structures import Boxes, Instances

logger = logging.getLogger(__name__)


def _is_tracing():
    # (fixed in TORCH_VERSION >= 1.9)
    if torch.jit.is_scripting():
        # https://github.com/pytorch/pytorch/issues/47379
        return False
    else:
        return torch.jit.is_tracing()


def find_top_rpn_proposals(
    proposals: List[torch.Tensor],
    pred_objectness_logits: List[torch.Tensor],
    image_sizes: List[Tuple[int, int]],
    nms_thresh: float,
    pre_nms_topk: int,
    post_nms_topk: int,
    min_box_size: float,
    training: bool,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps for each image.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        image_sizes (list[tuple]): sizes (h, w) for each image
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_size (float): minimum proposal box side length in pixels (absolute units
            wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
            comment.

    Returns:
        list[Instances]: list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i, sorted by their
            objectness score in descending order.
    """
    num_images = len(image_sizes)
    device = (
        proposals[0].device
        if torch.jit.is_scripting()
        else ("cpu" if torch.jit.is_tracing() else proposals[0].device)
    )

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = move_device_like(torch.arange(num_images, device=device), proposals[0])
    for level_id, (proposals_i, logits_i) in enumerate(zip(proposals, pred_objectness_logits)):
        Hi_Wi_A = logits_i.shape[1]
        if isinstance(Hi_Wi_A, torch.Tensor):  # it's a tensor in tracing
            num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
        else:
            num_proposals_i = min(Hi_Wi_A, pre_nms_topk)

        topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            move_device_like(
                torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device),
                proposals[0],
            )
        )

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results: List[Instances] = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        lvl = level_ids

        valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
        if not valid_mask.all():
            if training:
                raise FloatingPointError(
                    "Predicted boxes or scores contain Inf/NaN. Training has diverged."
                )
            boxes = boxes[valid_mask]
            scores_per_img = scores_per_img[valid_mask]
            lvl = lvl[valid_mask]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_size)
        if _is_tracing() or keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]  # keep is already sorted

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results


def add_ground_truth_to_proposals(
    gt: Union[List[Instances], List[Boxes]], proposals: List[Instances]
) -> List[Instances]:
    """
    Call `add_ground_truth_to_proposals_single_image` for all images.

    Args:
        gt(Union[List[Instances], List[Boxes]): list of N elements. Element i is a Instances
            representing the ground-truth for image i.
        proposals (list[Instances]): list of N elements. Element i is a Instances
            representing the proposals for image i.

    Returns:
        list[Instances]: list of N Instances. Each is the proposals for the image,
            with field "proposal_boxes" and "objectness_logits".
    """
    assert gt is not None

    if len(proposals) != len(gt):
        raise ValueError("proposals and gt should have the same length as the number of images!")
    if len(proposals) == 0:
        return proposals

    return [
        add_ground_truth_to_proposals_single_image(gt_i, proposals_i)
        for gt_i, proposals_i in zip(gt, proposals)
    ]


def add_ground_truth_to_proposals_single_image(
    gt: Union[Instances, Boxes], proposals: Instances
) -> Instances:
    """
    Augment `proposals` with `gt`.

    Args:
        Same as `add_ground_truth_to_proposals`, but with gt and proposals
        per image.

    Returns:
        Same as `add_ground_truth_to_proposals`, but for only one image.
    """
    if isinstance(gt, Boxes):
        # convert Boxes to Instances
        gt = Instances(proposals.image_size, gt_boxes=gt)

    gt_boxes = gt.gt_boxes
    device = proposals.objectness_logits.device
    # Assign all ground-truth boxes an objectness logit corresponding to
    # P(object) = sigmoid(logit) =~ 1.
    gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10)))
    gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device)

    # Concatenating gt_boxes with proposals requires them to have the same fields
    gt_proposal = Instances(proposals.image_size, **gt.get_fields())
    gt_proposal.proposal_boxes = gt_boxes
    gt_proposal.objectness_logits = gt_logits

    for key in proposals.get_fields().keys():
        assert gt_proposal.has(
            key
        ), "The attribute '{}' in `proposals` does not exist in `gt`".format(key)

    # NOTE: Instances.cat only use fields from the first item. Extra fields in latter items
    # will be thrown away.
    new_proposals = Instances.cat([proposals, gt_proposal])

    return new_proposals
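A small worked check on the ground-truth objectness logit used above: `gt_logit_value` is just the inverse sigmoid of a probability very close to 1, so appended ground-truth boxes always rank at the top of the proposal list. This snippet is illustrative only, not part of the diff.

```python
# Illustrative check of the gt objectness logit computed in
# add_ground_truth_to_proposals_single_image.
import math
import torch

p = 1.0 - 1e-10
gt_logit_value = math.log(p / (1 - p))          # ~ 23.03
assert torch.sigmoid(torch.tensor(gt_logit_value)).item() > 0.999999
```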
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rpn.py
ADDED
@@ -0,0 +1,533 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
from typing import Dict, List, Optional, Tuple, Union
|
3 |
+
import torch
|
4 |
+
import torch.nn.functional as F
|
5 |
+
from torch import nn
|
6 |
+
|
7 |
+
from annotator.oneformer.detectron2.config import configurable
|
8 |
+
from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, cat
|
9 |
+
from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
|
10 |
+
from annotator.oneformer.detectron2.utils.events import get_event_storage
|
11 |
+
from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom
|
12 |
+
from annotator.oneformer.detectron2.utils.registry import Registry
|
13 |
+
|
14 |
+
from ..anchor_generator import build_anchor_generator
|
15 |
+
from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
|
16 |
+
from ..matcher import Matcher
|
17 |
+
from ..sampling import subsample_labels
|
18 |
+
from .build import PROPOSAL_GENERATOR_REGISTRY
|
19 |
+
from .proposal_utils import find_top_rpn_proposals
|
20 |
+
|
21 |
+
RPN_HEAD_REGISTRY = Registry("RPN_HEAD")
|
22 |
+
RPN_HEAD_REGISTRY.__doc__ = """
|
23 |
+
Registry for RPN heads, which take feature maps and perform
|
24 |
+
objectness classification and bounding box regression for anchors.
|
25 |
+
|
26 |
+
The registered object will be called with `obj(cfg, input_shape)`.
|
27 |
+
The call should return a `nn.Module` object.
|
28 |
+
"""
|
29 |
+
|
30 |
+
|
31 |
+
"""
|
32 |
+
Shape shorthand in this module:
|
33 |
+
|
34 |
+
N: number of images in the minibatch
|
35 |
+
L: number of feature maps per image on which RPN is run
|
36 |
+
A: number of cell anchors (must be the same for all feature maps)
|
37 |
+
Hi, Wi: height and width of the i-th feature map
|
38 |
+
B: size of the box parameterization
|
39 |
+
|
40 |
+
Naming convention:
|
41 |
+
|
42 |
+
objectness: refers to the binary classification of an anchor as object vs. not object.
|
43 |
+
|
44 |
+
deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
|
45 |
+
transform (see :class:`box_regression.Box2BoxTransform`), or 5d for rotated boxes.
|
46 |
+
|
47 |
+
pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use
|
48 |
+
sigmoid(pred_objectness_logits) to estimate P(object).
|
49 |
+
|
50 |
+
gt_labels: ground-truth binary classification labels for objectness
|
51 |
+
|
52 |
+
pred_anchor_deltas: predicted box2box transform deltas
|
53 |
+
|
54 |
+
gt_anchor_deltas: ground-truth box2box transform deltas
|
55 |
+
"""
|
56 |
+
|
57 |
+
|
58 |
+
def build_rpn_head(cfg, input_shape):
|
59 |
+
"""
|
60 |
+
Build an RPN head defined by `cfg.MODEL.RPN.HEAD_NAME`.
|
61 |
+
"""
|
62 |
+
name = cfg.MODEL.RPN.HEAD_NAME
|
63 |
+
return RPN_HEAD_REGISTRY.get(name)(cfg, input_shape)
|
64 |
+
|
65 |
+
|
66 |
+
@RPN_HEAD_REGISTRY.register()
|
67 |
+
class StandardRPNHead(nn.Module):
|
68 |
+
"""
|
69 |
+
Standard RPN classification and regression heads described in :paper:`Faster R-CNN`.
|
70 |
+
Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts
|
71 |
+
objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas
|
72 |
+
specifying how to deform each anchor into an object proposal.
|
73 |
+
"""
|
74 |
+
|
75 |
+
@configurable
|
76 |
+
def __init__(
|
77 |
+
self, *, in_channels: int, num_anchors: int, box_dim: int = 4, conv_dims: List[int] = (-1,)
|
78 |
+
):
|
79 |
+
"""
|
80 |
+
NOTE: this interface is experimental.
|
81 |
+
|
82 |
+
Args:
|
83 |
+
in_channels (int): number of input feature channels. When using multiple
|
84 |
+
input features, they must have the same number of channels.
|
85 |
+
num_anchors (int): number of anchors to predict for *each spatial position*
|
86 |
+
on the feature map. The total number of anchors for each
|
87 |
+
feature map will be `num_anchors * H * W`.
|
88 |
+
box_dim (int): dimension of a box, which is also the number of box regression
|
89 |
+
predictions to make for each anchor. An axis aligned box has
|
90 |
+
box_dim=4, while a rotated box has box_dim=5.
|
91 |
+
conv_dims (list[int]): a list of integers representing the output channels
|
92 |
+
of N conv layers. Set it to -1 to use the same number of output channels
|
93 |
+
as input channels.
|
94 |
+
"""
|
95 |
+
super().__init__()
|
96 |
+
cur_channels = in_channels
|
97 |
+
# Keeping the old variable names and structure for backwards compatiblity.
|
98 |
+
# Otherwise the old checkpoints will fail to load.
|
99 |
+
if len(conv_dims) == 1:
|
100 |
+
out_channels = cur_channels if conv_dims[0] == -1 else conv_dims[0]
|
101 |
+
# 3x3 conv for the hidden representation
|
102 |
+
self.conv = self._get_rpn_conv(cur_channels, out_channels)
|
103 |
+
cur_channels = out_channels
|
104 |
+
else:
|
105 |
+
self.conv = nn.Sequential()
|
106 |
+
for k, conv_dim in enumerate(conv_dims):
|
107 |
+
out_channels = cur_channels if conv_dim == -1 else conv_dim
|
108 |
+
if out_channels <= 0:
|
109 |
+
raise ValueError(
|
110 |
+
f"Conv output channels should be greater than 0. Got {out_channels}"
|
111 |
+
)
|
112 |
+
conv = self._get_rpn_conv(cur_channels, out_channels)
|
113 |
+
self.conv.add_module(f"conv{k}", conv)
|
114 |
+
cur_channels = out_channels
|
115 |
+
# 1x1 conv for predicting objectness logits
|
116 |
+
self.objectness_logits = nn.Conv2d(cur_channels, num_anchors, kernel_size=1, stride=1)
|
117 |
+
# 1x1 conv for predicting box2box transform deltas
|
118 |
+
self.anchor_deltas = nn.Conv2d(cur_channels, num_anchors * box_dim, kernel_size=1, stride=1)
|
119 |
+
|
120 |
+
# Keeping the order of weights initialization same for backwards compatiblility.
|
121 |
+
for layer in self.modules():
|
122 |
+
if isinstance(layer, nn.Conv2d):
|
123 |
+
nn.init.normal_(layer.weight, std=0.01)
|
124 |
+
nn.init.constant_(layer.bias, 0)
|
125 |
+
|
126 |
+
def _get_rpn_conv(self, in_channels, out_channels):
|
127 |
+
return Conv2d(
|
128 |
+
in_channels,
|
129 |
+
out_channels,
|
130 |
+
kernel_size=3,
|
131 |
+
stride=1,
|
132 |
+
padding=1,
|
133 |
+
activation=nn.ReLU(),
|
134 |
+
)
|
135 |
+
|
136 |
+
@classmethod
|
137 |
+
def from_config(cls, cfg, input_shape):
|
138 |
+
# Standard RPN is shared across levels:
|
139 |
+
in_channels = [s.channels for s in input_shape]
|
140 |
+
assert len(set(in_channels)) == 1, "Each level must have the same channel!"
|
141 |
+
in_channels = in_channels[0]
|
142 |
+
|
143 |
+
# RPNHead should take the same input as anchor generator
|
144 |
+
# NOTE: it assumes that creating an anchor generator does not have unwanted side effect.
|
145 |
+
anchor_generator = build_anchor_generator(cfg, input_shape)
|
146 |
+
num_anchors = anchor_generator.num_anchors
|
147 |
+
box_dim = anchor_generator.box_dim
|
148 |
+
assert (
|
149 |
+
len(set(num_anchors)) == 1
|
150 |
+
), "Each level must have the same number of anchors per spatial position"
|
151 |
+
return {
|
152 |
+
"in_channels": in_channels,
|
153 |
+
"num_anchors": num_anchors[0],
|
154 |
+
"box_dim": box_dim,
|
155 |
+
"conv_dims": cfg.MODEL.RPN.CONV_DIMS,
|
156 |
+
}
|
157 |
+
|
158 |
+
def forward(self, features: List[torch.Tensor]):
|
159 |
+
"""
|
160 |
+
Args:
|
161 |
+
features (list[Tensor]): list of feature maps
|
162 |
+
|
163 |
+
Returns:
|
164 |
+
list[Tensor]: A list of L elements.
|
165 |
+
Element i is a tensor of shape (N, A, Hi, Wi) representing
|
166 |
+
the predicted objectness logits for all anchors. A is the number of cell anchors.
|
167 |
+
list[Tensor]: A list of L elements. Element i is a tensor of shape
|
168 |
+
(N, A*box_dim, Hi, Wi) representing the predicted "deltas" used to transform anchors
|
169 |
+
to proposals.
|
170 |
+
"""
|
171 |
+
pred_objectness_logits = []
|
172 |
+
pred_anchor_deltas = []
|
173 |
+
for x in features:
|
174 |
+
t = self.conv(x)
|
175 |
+
pred_objectness_logits.append(self.objectness_logits(t))
|
176 |
+
pred_anchor_deltas.append(self.anchor_deltas(t))
|
177 |
+
return pred_objectness_logits, pred_anchor_deltas
|
178 |
+
|
179 |
+
|
180 |
+
@PROPOSAL_GENERATOR_REGISTRY.register()
|
181 |
+
class RPN(nn.Module):
|
182 |
+
"""
|
183 |
+
Region Proposal Network, introduced by :paper:`Faster R-CNN`.
|
184 |
+
"""
|
185 |
+
|
186 |
+
@configurable
|
187 |
+
def __init__(
|
188 |
+
self,
|
189 |
+
*,
|
190 |
+
in_features: List[str],
|
191 |
+
head: nn.Module,
|
192 |
+
anchor_generator: nn.Module,
|
193 |
+
anchor_matcher: Matcher,
|
194 |
+
box2box_transform: Box2BoxTransform,
|
195 |
+
batch_size_per_image: int,
|
196 |
+
positive_fraction: float,
|
197 |
+
pre_nms_topk: Tuple[float, float],
|
198 |
+
post_nms_topk: Tuple[float, float],
|
199 |
+
nms_thresh: float = 0.7,
|
200 |
+
min_box_size: float = 0.0,
|
201 |
+
anchor_boundary_thresh: float = -1.0,
|
202 |
+
loss_weight: Union[float, Dict[str, float]] = 1.0,
|
203 |
+
box_reg_loss_type: str = "smooth_l1",
|
204 |
+
smooth_l1_beta: float = 0.0,
|
205 |
+
):
|
206 |
+
"""
|
207 |
+
NOTE: this interface is experimental.
|
208 |
+
|
209 |
+
Args:
|
210 |
+
in_features (list[str]): list of names of input features to use
|
211 |
+
head (nn.Module): a module that predicts logits and regression deltas
|
212 |
+
for each level from a list of per-level features
|
213 |
+
anchor_generator (nn.Module): a module that creates anchors from a
|
214 |
+
list of features. Usually an instance of :class:`AnchorGenerator`
|
215 |
+
anchor_matcher (Matcher): label the anchors by matching them with ground truth.
|
216 |
+
box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to
|
217 |
+
instance boxes
|
218 |
+
batch_size_per_image (int): number of anchors per image to sample for training
|
219 |
+
positive_fraction (float): fraction of foreground anchors to sample for training
|
220 |
+
pre_nms_topk (tuple[float]): (train, test) that represents the
|
221 |
+
number of top k proposals to select before NMS, in
|
222 |
+
training and testing.
|
223 |
+
post_nms_topk (tuple[float]): (train, test) that represents the
|
224 |
+
number of top k proposals to select after NMS, in
|
225 |
+
training and testing.
|
226 |
+
nms_thresh (float): NMS threshold used to de-duplicate the predicted proposals
|
227 |
+
min_box_size (float): remove proposal boxes with any side smaller than this threshold,
|
228 |
+
in the unit of input image pixels
|
229 |
+
anchor_boundary_thresh (float): legacy option
|
230 |
+
loss_weight (float|dict): weights to use for losses. Can be single float for weighting
|
231 |
+
all rpn losses together, or a dict of individual weightings. Valid dict keys are:
|
232 |
+
"loss_rpn_cls" - applied to classification loss
|
233 |
+
"loss_rpn_loc" - applied to box regression loss
|
234 |
+
box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou".
|
235 |
+
smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
|
236 |
+
use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
|
237 |
+
"""
|
238 |
+
super().__init__()
|
239 |
+
self.in_features = in_features
|
240 |
+
self.rpn_head = head
|
241 |
+
self.anchor_generator = anchor_generator
|
242 |
+
self.anchor_matcher = anchor_matcher
|
243 |
+
self.box2box_transform = box2box_transform
|
244 |
+
self.batch_size_per_image = batch_size_per_image
|
245 |
+
self.positive_fraction = positive_fraction
|
246 |
+
# Map from self.training state to train/test settings
|
247 |
+
self.pre_nms_topk = {True: pre_nms_topk[0], False: pre_nms_topk[1]}
|
248 |
+
self.post_nms_topk = {True: post_nms_topk[0], False: post_nms_topk[1]}
|
249 |
+
self.nms_thresh = nms_thresh
|
250 |
+
self.min_box_size = float(min_box_size)
|
251 |
+
self.anchor_boundary_thresh = anchor_boundary_thresh
|
252 |
+
if isinstance(loss_weight, float):
|
253 |
+
loss_weight = {"loss_rpn_cls": loss_weight, "loss_rpn_loc": loss_weight}
|
254 |
+
self.loss_weight = loss_weight
|
255 |
+
self.box_reg_loss_type = box_reg_loss_type
|
256 |
+
self.smooth_l1_beta = smooth_l1_beta
|
257 |
+
|
258 |
+
@classmethod
|
259 |
+
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
|
260 |
+
in_features = cfg.MODEL.RPN.IN_FEATURES
|
261 |
+
ret = {
|
262 |
+
"in_features": in_features,
|
263 |
+
"min_box_size": cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE,
|
264 |
+
"nms_thresh": cfg.MODEL.RPN.NMS_THRESH,
|
265 |
+
"batch_size_per_image": cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE,
|
266 |
+
"positive_fraction": cfg.MODEL.RPN.POSITIVE_FRACTION,
|
267 |
+
"loss_weight": {
|
268 |
+
"loss_rpn_cls": cfg.MODEL.RPN.LOSS_WEIGHT,
|
269 |
+
"loss_rpn_loc": cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT,
|
270 |
+
},
|
271 |
+
"anchor_boundary_thresh": cfg.MODEL.RPN.BOUNDARY_THRESH,
|
272 |
+
"box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS),
|
273 |
+
"box_reg_loss_type": cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE,
|
274 |
+
"smooth_l1_beta": cfg.MODEL.RPN.SMOOTH_L1_BETA,
|
275 |
+
}
|
276 |
+
|
277 |
+
ret["pre_nms_topk"] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, cfg.MODEL.RPN.PRE_NMS_TOPK_TEST)
|
278 |
+
ret["post_nms_topk"] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, cfg.MODEL.RPN.POST_NMS_TOPK_TEST)
|
279 |
+
|
280 |
+
ret["anchor_generator"] = build_anchor_generator(cfg, [input_shape[f] for f in in_features])
|
281 |
+
ret["anchor_matcher"] = Matcher(
|
282 |
+
cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True
|
283 |
+
)
|
284 |
+
ret["head"] = build_rpn_head(cfg, [input_shape[f] for f in in_features])
|
285 |
+
return ret
|
286 |
+
|
287 |
+
def _subsample_labels(self, label):
|
288 |
+
"""
|
289 |
+
Randomly sample a subset of positive and negative examples, and overwrite
|
290 |
+
the label vector to the ignore value (-1) for all elements that are not
|
291 |
+
included in the sample.
|
292 |
+
|
293 |
+
Args:
|
294 |
+
labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
|
295 |
+
"""
|
296 |
+
pos_idx, neg_idx = subsample_labels(
|
297 |
+
label, self.batch_size_per_image, self.positive_fraction, 0
|
298 |
+
)
|
299 |
+
# Fill with the ignore label (-1), then set positive and negative labels
|
300 |
+
label.fill_(-1)
|
301 |
+
label.scatter_(0, pos_idx, 1)
|
302 |
+
label.scatter_(0, neg_idx, 0)
|
303 |
+
return label
|
304 |
+
|
305 |
+
@torch.jit.unused
|
306 |
+
@torch.no_grad()
|
307 |
+
def label_and_sample_anchors(
|
308 |
+
self, anchors: List[Boxes], gt_instances: List[Instances]
|
309 |
+
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
|
310 |
+
"""
|
311 |
+
Args:
|
312 |
+
anchors (list[Boxes]): anchors for each feature map.
|
313 |
+
gt_instances: the ground-truth instances for each image.
|
314 |
+
|
315 |
+
Returns:
|
316 |
+
list[Tensor]:
|
317 |
+
List of #img tensors. i-th element is a vector of labels whose length is
|
318 |
+
the total number of anchors across all feature maps R = sum(Hi * Wi * A).
|
319 |
+
Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative
|
320 |
+
class; 1 = positive class.
|
321 |
+
list[Tensor]:
|
322 |
+
i-th element is a Rx4 tensor. The values are the matched gt boxes for each
|
323 |
+
anchor. Values are undefined for those anchors not labeled as 1.
|
324 |
+
"""
|
325 |
+
anchors = Boxes.cat(anchors)
|
326 |
+
|
327 |
+
gt_boxes = [x.gt_boxes for x in gt_instances]
|
328 |
+
image_sizes = [x.image_size for x in gt_instances]
|
329 |
+
del gt_instances
|
330 |
+
|
331 |
+
gt_labels = []
|
332 |
+
matched_gt_boxes = []
|
333 |
+
for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes):
|
334 |
+
"""
|
335 |
+
image_size_i: (h, w) for the i-th image
|
336 |
+
gt_boxes_i: ground-truth boxes for i-th image
|
337 |
+
"""
|
338 |
+
|
339 |
+
match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors)
|
340 |
+
matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
|
341 |
+
# Matching is memory-expensive and may result in CPU tensors. But the result is small
|
342 |
+
gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
|
343 |
+
del match_quality_matrix
|
344 |
+
|
345 |
+
if self.anchor_boundary_thresh >= 0:
|
346 |
+
# Discard anchors that go out of the boundaries of the image
|
347 |
+
# NOTE: This is legacy functionality that is turned off by default in Detectron2
|
348 |
+
anchors_inside_image = anchors.inside_box(image_size_i, self.anchor_boundary_thresh)
|
349 |
+
gt_labels_i[~anchors_inside_image] = -1
|
350 |
+
|
351 |
+
# A vector of labels (-1, 0, 1) for each anchor
|
352 |
+
gt_labels_i = self._subsample_labels(gt_labels_i)
|
353 |
+
|
354 |
+
if len(gt_boxes_i) == 0:
|
355 |
+
# These values won't be used anyway since the anchor is labeled as background
|
356 |
+
matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
|
357 |
+
else:
|
358 |
+
# TODO wasted indexing computation for ignored boxes
|
359 |
+
matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
|
360 |
+
|
361 |
+
gt_labels.append(gt_labels_i) # N,AHW
|
362 |
+
matched_gt_boxes.append(matched_gt_boxes_i)
|
363 |
+
return gt_labels, matched_gt_boxes
|
364 |
+
|
365 |
+
@torch.jit.unused
|
366 |
+
def losses(
|
367 |
+
self,
|
368 |
+
anchors: List[Boxes],
|
369 |
+
pred_objectness_logits: List[torch.Tensor],
|
370 |
+
gt_labels: List[torch.Tensor],
|
371 |
+
pred_anchor_deltas: List[torch.Tensor],
|
372 |
+
gt_boxes: List[torch.Tensor],
|
373 |
+
) -> Dict[str, torch.Tensor]:
|
374 |
+
"""
|
375 |
+
Return the losses from a set of RPN predictions and their associated ground-truth.
|
376 |
+
|
377 |
+
Args:
|
378 |
+
anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
|
379 |
+
has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
|
380 |
+
pred_objectness_logits (list[Tensor]): A list of L elements.
|
381 |
+
Element i is a tensor of shape (N, Hi*Wi*A) representing
|
382 |
+
the predicted objectness logits for all anchors.
|
383 |
+
gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
|
384 |
+
pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
|
385 |
+
(N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors
|
386 |
+
to proposals.
|
387 |
+
gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
|
388 |
+
|
389 |
+
Returns:
|
390 |
+
dict[loss name -> loss value]: A dict mapping from loss name to loss value.
|
391 |
+
Loss names are: `loss_rpn_cls` for objectness classification and
|
392 |
+
`loss_rpn_loc` for proposal localization.
|
393 |
+
"""
|
394 |
+
num_images = len(gt_labels)
|
395 |
+
gt_labels = torch.stack(gt_labels) # (N, sum(Hi*Wi*Ai))
|
396 |
+
|
397 |
+
# Log the number of positive/negative anchors per-image that's used in training
|
398 |
+
pos_mask = gt_labels == 1
|
399 |
+
num_pos_anchors = pos_mask.sum().item()
|
400 |
+
num_neg_anchors = (gt_labels == 0).sum().item()
|
401 |
+
storage = get_event_storage()
|
402 |
+
storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
|
403 |
+
storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)
|
404 |
+
|
405 |
+
localization_loss = _dense_box_regression_loss(
|
406 |
+
anchors,
|
407 |
+
self.box2box_transform,
|
408 |
+
pred_anchor_deltas,
|
409 |
+
gt_boxes,
|
410 |
+
pos_mask,
|
411 |
+
box_reg_loss_type=self.box_reg_loss_type,
|
412 |
+
smooth_l1_beta=self.smooth_l1_beta,
|
413 |
+
)
|
414 |
+
|
415 |
+
valid_mask = gt_labels >= 0
|
416 |
+
objectness_loss = F.binary_cross_entropy_with_logits(
|
417 |
+
cat(pred_objectness_logits, dim=1)[valid_mask],
|
418 |
+
gt_labels[valid_mask].to(torch.float32),
|
419 |
+
reduction="sum",
|
420 |
+
)
|
421 |
+
normalizer = self.batch_size_per_image * num_images
|
422 |
+
losses = {
|
423 |
+
"loss_rpn_cls": objectness_loss / normalizer,
|
424 |
+
# The original Faster R-CNN paper uses a slightly different normalizer
|
425 |
+
# for loc loss. But it doesn't matter in practice
|
426 |
+
"loss_rpn_loc": localization_loss / normalizer,
|
427 |
+
}
|
428 |
+
losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
|
429 |
+
return losses
|
430 |
+
|
431 |
+
def forward(
|
432 |
+
self,
|
433 |
+
images: ImageList,
|
434 |
+
features: Dict[str, torch.Tensor],
|
435 |
+
gt_instances: Optional[List[Instances]] = None,
|
436 |
+
):
|
437 |
+
"""
|
438 |
+
Args:
|
439 |
+
images (ImageList): input images of length `N`
|
440 |
+
features (dict[str, Tensor]): input data as a mapping from feature
|
441 |
+
map name to tensor. Axis 0 represents the number of images `N` in
|
442 |
+
the input data; axes 1-3 are channels, height, and width, which may
|
443 |
+
vary between feature maps (e.g., if a feature pyramid is used).
|
444 |
+
gt_instances (list[Instances], optional): a length `N` list of `Instances`s.
|
445 |
+
Each `Instances` stores ground-truth instances for the corresponding image.
|
446 |
+
|
447 |
+
Returns:
|
448 |
+
proposals: list[Instances]: contains fields "proposal_boxes", "objectness_logits"
|
449 |
+
loss: dict[Tensor] or None
|
450 |
+
"""
|
451 |
+
features = [features[f] for f in self.in_features]
|
452 |
+
anchors = self.anchor_generator(features)
|
453 |
+
|
454 |
+
pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features)
|
455 |
+
# Transpose the Hi*Wi*A dimension to the middle:
|
456 |
+
pred_objectness_logits = [
|
457 |
+
# (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N, Hi*Wi*A)
|
458 |
+
score.permute(0, 2, 3, 1).flatten(1)
|
459 |
+
for score in pred_objectness_logits
|
460 |
+
]
|
461 |
+
pred_anchor_deltas = [
|
462 |
+
# (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B) -> (N, Hi*Wi*A, B)
|
463 |
+
x.view(x.shape[0], -1, self.anchor_generator.box_dim, x.shape[-2], x.shape[-1])
|
464 |
+
.permute(0, 3, 4, 1, 2)
|
465 |
+
.flatten(1, -2)
|
466 |
+
for x in pred_anchor_deltas
|
467 |
+
]
|
468 |
+
|
469 |
+
if self.training:
|
470 |
+
assert gt_instances is not None, "RPN requires gt_instances in training!"
|
471 |
+
gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances)
|
472 |
+
losses = self.losses(
|
473 |
+
anchors, pred_objectness_logits, gt_labels, pred_anchor_deltas, gt_boxes
|
474 |
+
)
|
475 |
+
else:
|
476 |
+
losses = {}
|
477 |
+
proposals = self.predict_proposals(
|
478 |
+
anchors, pred_objectness_logits, pred_anchor_deltas, images.image_sizes
|
479 |
+
)
|
480 |
+
return proposals, losses
|
481 |
+
|
482 |
+
def predict_proposals(
|
483 |
+
self,
|
484 |
+
anchors: List[Boxes],
|
485 |
+
pred_objectness_logits: List[torch.Tensor],
|
486 |
+
pred_anchor_deltas: List[torch.Tensor],
|
487 |
+
image_sizes: List[Tuple[int, int]],
|
488 |
+
):
|
489 |
+
"""
|
490 |
+
Decode all the predicted box regression deltas to proposals. Find the top proposals
|
491 |
+
by applying NMS and removing boxes that are too small.
|
492 |
+
|
493 |
+
Returns:
|
494 |
+
proposals (list[Instances]): list of N Instances. The i-th Instances
|
495 |
+
stores post_nms_topk object proposals for image i, sorted by their
|
496 |
+
objectness score in descending order.
|
497 |
+
"""
|
498 |
+
# The proposals are treated as fixed for joint training with roi heads.
|
499 |
+
# This approach ignores the derivative w.r.t. the proposal boxes’ coordinates that
|
500 |
+
# are also network responses.
|
501 |
+
with torch.no_grad():
|
502 |
+
pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
|
503 |
+
return find_top_rpn_proposals(
|
504 |
+
pred_proposals,
|
505 |
+
pred_objectness_logits,
|
506 |
+
image_sizes,
|
507 |
+
self.nms_thresh,
|
508 |
+
self.pre_nms_topk[self.training],
|
509 |
+
self.post_nms_topk[self.training],
|
510 |
+
self.min_box_size,
|
511 |
+
self.training,
|
512 |
+
)
|
513 |
+
|
514 |
+
def _decode_proposals(self, anchors: List[Boxes], pred_anchor_deltas: List[torch.Tensor]):
|
515 |
+
"""
|
516 |
+
Transform anchors into proposals by applying the predicted anchor deltas.
|
517 |
+
|
518 |
+
Returns:
|
519 |
+
proposals (list[Tensor]): A list of L tensors. Tensor i has shape
|
520 |
+
(N, Hi*Wi*A, B)
|
521 |
+
"""
|
522 |
+
N = pred_anchor_deltas[0].shape[0]
|
523 |
+
proposals = []
|
524 |
+
# For each feature map
|
525 |
+
for anchors_i, pred_anchor_deltas_i in zip(anchors, pred_anchor_deltas):
|
526 |
+
B = anchors_i.tensor.size(1)
|
527 |
+
pred_anchor_deltas_i = pred_anchor_deltas_i.reshape(-1, B)
|
528 |
+
# Expand anchors to shape (N*Hi*Wi*A, B)
|
529 |
+
anchors_i = anchors_i.tensor.unsqueeze(0).expand(N, -1, -1).reshape(-1, B)
|
530 |
+
proposals_i = self.box2box_transform.apply_deltas(pred_anchor_deltas_i, anchors_i)
|
531 |
+
# Append feature map proposals with shape (N, Hi*Wi*A, B)
|
532 |
+
proposals.append(proposals_i.view(N, -1, B))
|
533 |
+
return proposals
|
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rrpn.py
ADDED
@@ -0,0 +1,209 @@
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
import itertools
|
3 |
+
import logging
|
4 |
+
from typing import Dict, List
|
5 |
+
import torch
|
6 |
+
|
7 |
+
from annotator.oneformer.detectron2.config import configurable
|
8 |
+
from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms_rotated, cat
|
9 |
+
from annotator.oneformer.detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
|
10 |
+
from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom
|
11 |
+
|
12 |
+
from ..box_regression import Box2BoxTransformRotated
|
13 |
+
from .build import PROPOSAL_GENERATOR_REGISTRY
|
14 |
+
from .proposal_utils import _is_tracing
|
15 |
+
from .rpn import RPN
|
16 |
+
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
|
20 |
+
def find_top_rrpn_proposals(
|
21 |
+
proposals,
|
22 |
+
pred_objectness_logits,
|
23 |
+
image_sizes,
|
24 |
+
nms_thresh,
|
25 |
+
pre_nms_topk,
|
26 |
+
post_nms_topk,
|
27 |
+
min_box_size,
|
28 |
+
training,
|
29 |
+
):
|
30 |
+
"""
|
31 |
+
For each feature map, select the `pre_nms_topk` highest scoring proposals,
|
32 |
+
apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
|
33 |
+
highest scoring proposals among all the feature maps if `training` is True,
|
34 |
+
otherwise, returns the highest `post_nms_topk` scoring proposals for each
|
35 |
+
feature map.
|
36 |
+
|
37 |
+
Args:
|
38 |
+
proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
|
39 |
+
All proposal predictions on the feature maps.
|
40 |
+
pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
|
41 |
+
image_sizes (list[tuple]): sizes (h, w) for each image
|
42 |
+
nms_thresh (float): IoU threshold to use for NMS
|
43 |
+
pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
|
44 |
+
When RRPN is run on multiple feature maps (as in FPN) this number is per
|
45 |
+
feature map.
|
46 |
+
post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
|
47 |
+
When RRPN is run on multiple feature maps (as in FPN) this number is total,
|
48 |
+
over all feature maps.
|
49 |
+
min_box_size(float): minimum proposal box side length in pixels (absolute units wrt
|
50 |
+
input images).
|
51 |
+
training (bool): True if proposals are to be used in training, otherwise False.
|
52 |
+
This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
|
53 |
+
comment.
|
54 |
+
|
55 |
+
Returns:
|
56 |
+
proposals (list[Instances]): list of N Instances. The i-th Instances
|
57 |
+
stores post_nms_topk object proposals for image i.
|
58 |
+
"""
|
59 |
+
num_images = len(image_sizes)
|
60 |
+
device = proposals[0].device
|
61 |
+
|
62 |
+
# 1. Select top-k anchor for every level and every image
|
63 |
+
topk_scores = [] # #lvl Tensor, each of shape N x topk
|
64 |
+
topk_proposals = []
|
65 |
+
level_ids = [] # #lvl Tensor, each of shape (topk,)
|
66 |
+
batch_idx = torch.arange(num_images, device=device)
|
67 |
+
for level_id, proposals_i, logits_i in zip(
|
68 |
+
itertools.count(), proposals, pred_objectness_logits
|
69 |
+
):
|
70 |
+
Hi_Wi_A = logits_i.shape[1]
|
71 |
+
if isinstance(Hi_Wi_A, torch.Tensor): # it's a tensor in tracing
|
72 |
+
num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
|
73 |
+
else:
|
74 |
+
num_proposals_i = min(Hi_Wi_A, pre_nms_topk)
|
75 |
+
|
76 |
+
topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
|
77 |
+
|
78 |
+
# each is N x topk
|
79 |
+
topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 5
|
80 |
+
|
81 |
+
topk_proposals.append(topk_proposals_i)
|
82 |
+
topk_scores.append(topk_scores_i)
|
83 |
+
level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))
|
84 |
+
|
85 |
+
# 2. Concat all levels together
|
86 |
+
topk_scores = cat(topk_scores, dim=1)
|
87 |
+
topk_proposals = cat(topk_proposals, dim=1)
|
88 |
+
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = RotatedBoxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        lvl = level_ids

        valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
        if not valid_mask.all():
            if training:
                raise FloatingPointError(
                    "Predicted boxes or scores contain Inf/NaN. Training has diverged."
                )
            boxes = boxes[valid_mask]
            scores_per_img = scores_per_img[valid_mask]
            lvl = lvl[valid_mask]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_size)
        if _is_tracing() or keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], lvl[keep])

        keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results


@PROPOSAL_GENERATOR_REGISTRY.register()
class RRPN(RPN):
    """
    Rotated Region Proposal Network described in :paper:`RRPN`.
    """

    @configurable
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.anchor_boundary_thresh >= 0:
            raise NotImplementedError(
                "anchor_boundary_thresh is a legacy option not implemented for RRPN."
            )

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        ret = super().from_config(cfg, input_shape)
        ret["box2box_transform"] = Box2BoxTransformRotated(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
        return ret

    @torch.no_grad()
    def label_and_sample_anchors(self, anchors: List[RotatedBoxes], gt_instances: List[Instances]):
        """
        Args:
            anchors (list[RotatedBoxes]): anchors for each feature map.
            gt_instances: the ground-truth instances for each image.

        Returns:
            list[Tensor]:
                List of #img tensors. i-th element is a vector of labels whose length is
                the total number of anchors across feature maps. Label values are in {-1, 0, 1},
                with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
            list[Tensor]:
                i-th element is a Nx5 tensor, where N is the total number of anchors across
                feature maps. The values are the matched gt boxes for each anchor.
                Values are undefined for those anchors not labeled as 1.
        """
        anchors = RotatedBoxes.cat(anchors)

        gt_boxes = [x.gt_boxes for x in gt_instances]
        del gt_instances

        gt_labels = []
        matched_gt_boxes = []
        for gt_boxes_i in gt_boxes:
            """
            gt_boxes_i: ground-truth boxes for i-th image
            """
            match_quality_matrix = retry_if_cuda_oom(pairwise_iou_rotated)(gt_boxes_i, anchors)
            matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
            # Matching is memory-expensive and may result in CPU tensors. But the result is small
            gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)

            # A vector of labels (-1, 0, 1) for each anchor
            gt_labels_i = self._subsample_labels(gt_labels_i)

            if len(gt_boxes_i) == 0:
                # These values won't be used anyway since the anchor is labeled as background
                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
            else:
                # TODO wasted indexing computation for ignored boxes
                matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor

            gt_labels.append(gt_labels_i)  # N,AHW
            matched_gt_boxes.append(matched_gt_boxes_i)
        return gt_labels, matched_gt_boxes

    @torch.no_grad()
    def predict_proposals(self, anchors, pred_objectness_logits, pred_anchor_deltas, image_sizes):
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        return find_top_rrpn_proposals(
            pred_proposals,
            pred_objectness_logits,
            image_sizes,
            self.nms_thresh,
            self.pre_nms_topk[self.training],
            self.post_nms_topk[self.training],
            self.min_box_size,
            self.training,
        )
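Usage note (not part of the diff): the Inf/NaN filtering step inside `find_top_rrpn_proposals` above can be exercised in isolation with plain PyTorch. The snippet below is a minimal sketch of that single step with made-up box values; the (x_ctr, y_ctr, w, h, angle) layout is the RotatedBoxes convention assumed by the code above.

import torch

# Two rotated proposals; the second one is corrupted with a NaN coordinate.
boxes = torch.tensor([[10.0, 10.0, 20.0, 8.0, 30.0],
                      [float("nan"), 5.0, 5.0, 5.0, 0.0]])
scores = torch.tensor([0.9, 0.8])

# Same predicate as in find_top_rrpn_proposals: keep rows whose box and score are finite.
valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores)
boxes, scores = boxes[valid_mask], scores[valid_mask]
print(boxes.shape)  # torch.Size([1, 5]) -- the NaN proposal is dropped instead of reaching NMS

During training the function raises FloatingPointError instead, since non-finite predictions at that point usually mean the run has diverged.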
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/__init__.py
ADDED
@@ -0,0 +1,29 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head, FastRCNNConvFCHead
from .keypoint_head import (
    ROI_KEYPOINT_HEAD_REGISTRY,
    build_keypoint_head,
    BaseKeypointRCNNHead,
    KRCNNConvDeconvUpsampleHead,
)
from .mask_head import (
    ROI_MASK_HEAD_REGISTRY,
    build_mask_head,
    BaseMaskRCNNHead,
    MaskRCNNConvUpsampleHead,
)
from .roi_heads import (
    ROI_HEADS_REGISTRY,
    ROIHeads,
    Res5ROIHeads,
    StandardROIHeads,
    build_roi_heads,
    select_foreground_proposals,
)
from .cascade_rcnn import CascadeROIHeads
from .rotated_fast_rcnn import RROIHeads
from .fast_rcnn import FastRCNNOutputLayers

from . import cascade_rcnn  # isort:skip

__all__ = list(globals().keys())
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/box_head.py
ADDED
@@ -0,0 +1,118 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
from typing import List
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
from annotator.oneformer.detectron2.utils.registry import Registry

__all__ = ["FastRCNNConvFCHead", "build_box_head", "ROI_BOX_HEAD_REGISTRY"]

ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD")
ROI_BOX_HEAD_REGISTRY.__doc__ = """
Registry for box heads, which make box predictions from per-region features.

The registered object will be called with `obj(cfg, input_shape)`.
"""


# To get torchscript support, we make the head a subclass of `nn.Sequential`.
# Therefore, to add new layers in this head class, please make sure they are
# added in the order they will be used in forward().
@ROI_BOX_HEAD_REGISTRY.register()
class FastRCNNConvFCHead(nn.Sequential):
    """
    A head with several 3x3 conv layers (each followed by norm & relu) and then
    several fc layers (each followed by relu).
    """

    @configurable
    def __init__(
        self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm=""
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape (ShapeSpec): shape of the input feature.
            conv_dims (list[int]): the output dimensions of the conv layers
            fc_dims (list[int]): the output dimensions of the fc layers
            conv_norm (str or callable): normalization for the conv layers.
                See :func:`detectron2.layers.get_norm` for supported types.
        """
        super().__init__()
        assert len(conv_dims) + len(fc_dims) > 0

        self._output_size = (input_shape.channels, input_shape.height, input_shape.width)

        self.conv_norm_relus = []
        for k, conv_dim in enumerate(conv_dims):
            conv = Conv2d(
                self._output_size[0],
                conv_dim,
                kernel_size=3,
                padding=1,
                bias=not conv_norm,
                norm=get_norm(conv_norm, conv_dim),
                activation=nn.ReLU(),
            )
            self.add_module("conv{}".format(k + 1), conv)
            self.conv_norm_relus.append(conv)
            self._output_size = (conv_dim, self._output_size[1], self._output_size[2])

        self.fcs = []
        for k, fc_dim in enumerate(fc_dims):
            if k == 0:
                self.add_module("flatten", nn.Flatten())
            fc = nn.Linear(int(np.prod(self._output_size)), fc_dim)
            self.add_module("fc{}".format(k + 1), fc)
            self.add_module("fc_relu{}".format(k + 1), nn.ReLU())
            self.fcs.append(fc)
            self._output_size = fc_dim

        for layer in self.conv_norm_relus:
            weight_init.c2_msra_fill(layer)
        for layer in self.fcs:
            weight_init.c2_xavier_fill(layer)

    @classmethod
    def from_config(cls, cfg, input_shape):
        num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV
        conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM
        num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC
        fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM
        return {
            "input_shape": input_shape,
            "conv_dims": [conv_dim] * num_conv,
            "fc_dims": [fc_dim] * num_fc,
            "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM,
        }

    def forward(self, x):
        for layer in self:
            x = layer(x)
        return x

    @property
    @torch.jit.unused
    def output_shape(self):
        """
        Returns:
            ShapeSpec: the output feature shape
        """
        o = self._output_size
        if isinstance(o, int):
            return ShapeSpec(channels=o)
        else:
            return ShapeSpec(channels=o[0], height=o[1], width=o[2])


def build_box_head(cfg, input_shape):
    """
    Build a box head defined by `cfg.MODEL.ROI_BOX_HEAD.NAME`.
    """
    name = cfg.MODEL.ROI_BOX_HEAD.NAME
    return ROI_BOX_HEAD_REGISTRY.get(name)(cfg, input_shape)
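As a quick illustration of the head defined above (not part of the diff), `FastRCNNConvFCHead` can also be constructed directly with explicit arguments, since `@configurable` only routes through `from_config` when a config object is passed. The import path below assumes the vendored detectron2 copy added in this commit is importable.

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.box_head import FastRCNNConvFCHead

# Two 3x3 convs followed by one fc layer, on 256x7x7 pooled region features.
head = FastRCNNConvFCHead(
    ShapeSpec(channels=256, height=7, width=7),
    conv_dims=[256, 256],
    fc_dims=[1024],
)
x = torch.randn(8, 256, 7, 7)  # 8 pooled regions
print(head(x).shape)           # torch.Size([8, 1024])
print(head.output_shape)       # ShapeSpec with channels=1024 (fc output, so no spatial dims)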
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/cascade_rcnn.py
ADDED
@@ -0,0 +1,299 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from typing import List
import torch
from torch import nn
from torch.autograd.function import Function

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.structures import Boxes, Instances, pairwise_iou
from annotator.oneformer.detectron2.utils.events import get_event_storage

from ..box_regression import Box2BoxTransform
from ..matcher import Matcher
from ..poolers import ROIPooler
from .box_head import build_box_head
from .fast_rcnn import FastRCNNOutputLayers, fast_rcnn_inference
from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads


class _ScaleGradient(Function):
    @staticmethod
    def forward(ctx, input, scale):
        ctx.scale = scale
        return input

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output * ctx.scale, None


@ROI_HEADS_REGISTRY.register()
class CascadeROIHeads(StandardROIHeads):
    """
    The ROI heads that implement :paper:`Cascade R-CNN`.
    """

    @configurable
    def __init__(
        self,
        *,
        box_in_features: List[str],
        box_pooler: ROIPooler,
        box_heads: List[nn.Module],
        box_predictors: List[nn.Module],
        proposal_matchers: List[Matcher],
        **kwargs,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            box_pooler (ROIPooler): pooler that extracts region features from given boxes
            box_heads (list[nn.Module]): box head for each cascade stage
            box_predictors (list[nn.Module]): box predictor for each cascade stage
            proposal_matchers (list[Matcher]): matcher with different IoU thresholds to
                match boxes with ground truth for each stage. The first matcher matches
                RPN proposals with ground truth, the other matchers use boxes predicted
                by the previous stage as proposals and match them with ground truth.
        """
        assert "proposal_matcher" not in kwargs, (
            "CascadeROIHeads takes 'proposal_matchers=' for each stage instead "
            "of one 'proposal_matcher='."
        )
        # The first matcher matches RPN proposals with ground truth, done in the base class
        kwargs["proposal_matcher"] = proposal_matchers[0]
        num_stages = self.num_cascade_stages = len(box_heads)
        box_heads = nn.ModuleList(box_heads)
        box_predictors = nn.ModuleList(box_predictors)
        assert len(box_predictors) == num_stages, f"{len(box_predictors)} != {num_stages}!"
        assert len(proposal_matchers) == num_stages, f"{len(proposal_matchers)} != {num_stages}!"
        super().__init__(
            box_in_features=box_in_features,
            box_pooler=box_pooler,
            box_head=box_heads,
            box_predictor=box_predictors,
            **kwargs,
        )
        self.proposal_matchers = proposal_matchers

    @classmethod
    def from_config(cls, cfg, input_shape):
        ret = super().from_config(cfg, input_shape)
        ret.pop("proposal_matcher")
        return ret

    @classmethod
    def _init_box_head(cls, cfg, input_shape):
        # fmt: off
        in_features              = cfg.MODEL.ROI_HEADS.IN_FEATURES
        pooler_resolution        = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_scales            = tuple(1.0 / input_shape[k].stride for k in in_features)
        sampling_ratio           = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler_type              = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
        cascade_ious             = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
        assert len(cascade_bbox_reg_weights) == len(cascade_ious)
        assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG, \
            "CascadeROIHeads only support class-agnostic regression now!"
        assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
        # fmt: on

        in_channels = [input_shape[f].channels for f in in_features]
        # Check all channel counts are equal
        assert len(set(in_channels)) == 1, in_channels
        in_channels = in_channels[0]

        box_pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )
        pooled_shape = ShapeSpec(
            channels=in_channels, width=pooler_resolution, height=pooler_resolution
        )

        box_heads, box_predictors, proposal_matchers = [], [], []
        for match_iou, bbox_reg_weights in zip(cascade_ious, cascade_bbox_reg_weights):
            box_head = build_box_head(cfg, pooled_shape)
            box_heads.append(box_head)
            box_predictors.append(
                FastRCNNOutputLayers(
                    cfg,
                    box_head.output_shape,
                    box2box_transform=Box2BoxTransform(weights=bbox_reg_weights),
                )
            )
            proposal_matchers.append(Matcher([match_iou], [0, 1], allow_low_quality_matches=False))
        return {
            "box_in_features": in_features,
            "box_pooler": box_pooler,
            "box_heads": box_heads,
            "box_predictors": box_predictors,
            "proposal_matchers": proposal_matchers,
        }

    def forward(self, images, features, proposals, targets=None):
        del images
        if self.training:
            proposals = self.label_and_sample_proposals(proposals, targets)

        if self.training:
            # Need targets to box head
            losses = self._forward_box(features, proposals, targets)
            losses.update(self._forward_mask(features, proposals))
            losses.update(self._forward_keypoint(features, proposals))
            return proposals, losses
        else:
            pred_instances = self._forward_box(features, proposals)
            pred_instances = self.forward_with_given_boxes(features, pred_instances)
            return pred_instances, {}

    def _forward_box(self, features, proposals, targets=None):
        """
        Args:
            features, targets: same as in :meth:`ROIHeads.forward`.
            proposals (list[Instances]): the per-image object proposals with
                their matching ground truth.
                Each has fields "proposal_boxes", and "objectness_logits",
                "gt_classes", "gt_boxes".
        """
        features = [features[f] for f in self.box_in_features]
        head_outputs = []  # (predictor, predictions, proposals)
        prev_pred_boxes = None
        image_sizes = [x.image_size for x in proposals]
        for k in range(self.num_cascade_stages):
            if k > 0:
                # The output boxes of the previous stage are used to create the input
                # proposals of the next stage.
                proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes)
                if self.training:
                    proposals = self._match_and_label_boxes(proposals, k, targets)
            predictions = self._run_stage(features, proposals, k)
            prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals)
            head_outputs.append((self.box_predictor[k], predictions, proposals))

        if self.training:
            losses = {}
            storage = get_event_storage()
            for stage, (predictor, predictions, proposals) in enumerate(head_outputs):
                with storage.name_scope("stage{}".format(stage)):
                    stage_losses = predictor.losses(predictions, proposals)
                losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()})
            return losses
        else:
            # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
            scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs]

            # Average the scores across heads
            scores = [
                sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
                for scores_per_image in zip(*scores_per_stage)
            ]
            # Use the boxes of the last head
            predictor, predictions, proposals = head_outputs[-1]
            boxes = predictor.predict_boxes(predictions, proposals)
            pred_instances, _ = fast_rcnn_inference(
                boxes,
                scores,
                image_sizes,
                predictor.test_score_thresh,
                predictor.test_nms_thresh,
                predictor.test_topk_per_image,
            )
            return pred_instances

    @torch.no_grad()
    def _match_and_label_boxes(self, proposals, stage, targets):
        """
        Match proposals with groundtruth using the matcher at the given stage.
        Label the proposals as foreground or background based on the match.

        Args:
            proposals (list[Instances]): One Instances for each image, with
                the field "proposal_boxes".
            stage (int): the current stage
            targets (list[Instances]): the ground truth instances

        Returns:
            list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes"
        """
        num_fg_samples, num_bg_samples = [], []
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            match_quality_matrix = pairwise_iou(
                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
            )
            # proposal_labels are 0 or 1
            matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
            if len(targets_per_image) > 0:
                gt_classes = targets_per_image.gt_classes[matched_idxs]
                # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
                gt_classes[proposal_labels == 0] = self.num_classes
                gt_boxes = targets_per_image.gt_boxes[matched_idxs]
            else:
                gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
                gt_boxes = Boxes(
                    targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
                )
            proposals_per_image.gt_classes = gt_classes
            proposals_per_image.gt_boxes = gt_boxes

            num_fg_samples.append((proposal_labels == 1).sum().item())
            num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])

        # Log the number of fg/bg samples in each stage
        storage = get_event_storage()
        storage.put_scalar(
            "stage{}/roi_head/num_fg_samples".format(stage),
            sum(num_fg_samples) / len(num_fg_samples),
        )
        storage.put_scalar(
            "stage{}/roi_head/num_bg_samples".format(stage),
            sum(num_bg_samples) / len(num_bg_samples),
        )
        return proposals

    def _run_stage(self, features, proposals, stage):
        """
        Args:
            features (list[Tensor]): #lvl input features to ROIHeads
            proposals (list[Instances]): #image Instances, with the field "proposal_boxes"
            stage (int): the current stage

        Returns:
            Same output as `FastRCNNOutputLayers.forward()`.
        """
        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
        # The original implementation averages the losses among heads,
        # but scales up the parameter gradients of the heads.
        # This is equivalent to adding the losses among heads,
        # but scaling down the gradients on features.
        if self.training:
            box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
        box_features = self.box_head[stage](box_features)
        return self.box_predictor[stage](box_features)

    def _create_proposals_from_boxes(self, boxes, image_sizes):
        """
        Args:
            boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4
            image_sizes (list[tuple]): list of image shapes in (h, w)

        Returns:
            list[Instances]: per-image proposals with the given boxes.
        """
        # Just like RPN, the proposals should not have gradients
        boxes = [Boxes(b.detach()) for b in boxes]
        proposals = []
        for boxes_per_image, image_size in zip(boxes, image_sizes):
            boxes_per_image.clip(image_size)
            if self.training:
                # do not filter empty boxes at inference time,
                # because the scores from each stage need to be aligned and added later
                boxes_per_image = boxes_per_image[boxes_per_image.nonempty()]
            prop = Instances(image_size)
            prop.proposal_boxes = boxes_per_image
            proposals.append(prop)
        return proposals
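A small torch-only sketch (not part of the diff) of what `_ScaleGradient` does in `_run_stage`: the forward pass is the identity, while the gradient flowing back to the shared pooled features is scaled by 1/num_stages, which is what the comment about averaging losses across cascade stages refers to.

import torch
from torch.autograd.function import Function

class _ScaleGradient(Function):  # same definition as in cascade_rcnn.py above
    @staticmethod
    def forward(ctx, input, scale):
        ctx.scale = scale
        return input

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output * ctx.scale, None

x = torch.ones(3, requires_grad=True)
y = _ScaleGradient.apply(x, 1.0 / 3)  # e.g. 3 cascade stages
print(torch.equal(x, y))              # True: forward output is unchanged
y.sum().backward()
print(x.grad)                         # tensor([0.3333, 0.3333, 0.3333]): gradient scaled down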
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/fast_rcnn.py
ADDED
@@ -0,0 +1,569 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
from typing import Callable, Dict, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import functional as F

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.data.detection_utils import get_fed_loss_cls_weights
from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple
from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransform, _dense_box_regression_loss
from annotator.oneformer.detectron2.structures import Boxes, Instances
from annotator.oneformer.detectron2.utils.events import get_event_storage

__all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"]


logger = logging.getLogger(__name__)

"""
Shape shorthand in this module:

    N: number of images in the minibatch
    R: number of ROIs, combined over all images, in the minibatch
    Ri: number of ROIs in image i
    K: number of foreground classes. E.g., there are 80 foreground classes in COCO.

Naming convention:

    deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
    transform (see :class:`box_regression.Box2BoxTransform`).

    pred_class_logits: predicted class scores in [-inf, +inf]; use
        softmax(pred_class_logits) to estimate P(class).

    gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
        foreground object classes and K represents the background class.

    pred_proposal_deltas: predicted box2box transform deltas for transforming proposals
        to detection box predictions.

    gt_proposal_deltas: ground-truth box2box transform deltas
"""


def fast_rcnn_inference(
    boxes: List[torch.Tensor],
    scores: List[torch.Tensor],
    image_shapes: List[Tuple[int, int]],
    score_thresh: float,
    nms_thresh: float,
    topk_per_image: int,
):
    """
    Call `fast_rcnn_inference_single_image` for all images.

    Args:
        boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
            boxes for each image. Element i has shape (Ri, K * 4) if doing
            class-specific regression, or (Ri, 4) if doing class-agnostic
            regression, where Ri is the number of predicted objects for image i.
            This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
        scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
            Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
            for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`.
        image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch.
        score_thresh (float): Only return detections with a confidence score exceeding this
            threshold.
        nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1].
        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
            all detections.

    Returns:
        instances: (list[Instances]): A list of N instances, one for each image in the batch,
            that stores the topk most confident detections.
        kept_indices: (list[Tensor]): A list of N 1D tensors; element i indicates the
            corresponding boxes/scores indices in [0, Ri) from the input, for image i.
    """
    result_per_image = [
        fast_rcnn_inference_single_image(
            boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
        )
        for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
    ]
    return [x[0] for x in result_per_image], [x[1] for x in result_per_image]


def _log_classification_stats(pred_logits, gt_classes, prefix="fast_rcnn"):
    """
    Log the classification metrics to EventStorage.

    Args:
        pred_logits: Rx(K+1) logits. The last column is for background class.
        gt_classes: R labels
    """
    num_instances = gt_classes.numel()
    if num_instances == 0:
        return
    pred_classes = pred_logits.argmax(dim=1)
    bg_class_ind = pred_logits.shape[1] - 1

    fg_inds = (gt_classes >= 0) & (gt_classes < bg_class_ind)
    num_fg = fg_inds.nonzero().numel()
    fg_gt_classes = gt_classes[fg_inds]
    fg_pred_classes = pred_classes[fg_inds]

    num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel()
    num_accurate = (pred_classes == gt_classes).nonzero().numel()
    fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()

    storage = get_event_storage()
    storage.put_scalar(f"{prefix}/cls_accuracy", num_accurate / num_instances)
    if num_fg > 0:
        storage.put_scalar(f"{prefix}/fg_cls_accuracy", fg_num_accurate / num_fg)
        storage.put_scalar(f"{prefix}/false_negative", num_false_negative / num_fg)


def fast_rcnn_inference_single_image(
    boxes,
    scores,
    image_shape: Tuple[int, int],
    score_thresh: float,
    nms_thresh: float,
    topk_per_image: int,
):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
        per image.

    Returns:
        Same as `fast_rcnn_inference`, but for only one image.
    """
    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
    if not valid_mask.all():
        boxes = boxes[valid_mask]
        scores = scores[valid_mask]

    scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4
    # Convert to Boxes to use the `clip` function ...
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    # 1. Filter results based on detection scores. It can make NMS more efficient
    #    by filtering out low-confidence detections.
    filter_mask = scores > score_thresh  # R x K
    # R' x 2. First column contains indices of the R predictions;
    # Second column contains indices of classes.
    filter_inds = filter_mask.nonzero()
    if num_bbox_reg_classes == 1:
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]
    scores = scores[filter_mask]

    # 2. Apply NMS for each class independently.
    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]
    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.pred_classes = filter_inds[:, 1]
    return result, filter_inds[:, 0]


class FastRCNNOutputLayers(nn.Module):
    """
    Two linear layers for predicting Fast R-CNN outputs:

    1. proposal-to-detection box regression deltas
    2. classification scores
    """

    @configurable
    def __init__(
        self,
        input_shape: ShapeSpec,
        *,
        box2box_transform,
        num_classes: int,
        test_score_thresh: float = 0.0,
        test_nms_thresh: float = 0.5,
        test_topk_per_image: int = 100,
        cls_agnostic_bbox_reg: bool = False,
        smooth_l1_beta: float = 0.0,
        box_reg_loss_type: str = "smooth_l1",
        loss_weight: Union[float, Dict[str, float]] = 1.0,
        use_fed_loss: bool = False,
        use_sigmoid_ce: bool = False,
        get_fed_loss_cls_weights: Optional[Callable] = None,
        fed_loss_num_classes: int = 50,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape (ShapeSpec): shape of the input feature to this module
            box2box_transform (Box2BoxTransform or Box2BoxTransformRotated):
            num_classes (int): number of foreground classes
            test_score_thresh (float): threshold to filter prediction results.
            test_nms_thresh (float): NMS threshold for prediction results.
            test_topk_per_image (int): number of top predictions to produce per image.
            cls_agnostic_bbox_reg (bool): whether to use class-agnostic bbox regression
            smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if
                `box_reg_loss_type` is "smooth_l1"
            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou",
                "diou", "ciou"
            loss_weight (float|dict): weights to use for losses. Can be single float for weighting
                all losses, or a dict of individual weightings. Valid dict keys are:
                    * "loss_cls": applied to classification loss
                    * "loss_box_reg": applied to box regression loss
            use_fed_loss (bool): whether to use federated loss which samples additional negative
                classes to calculate the loss
            use_sigmoid_ce (bool): whether to calculate the loss using weighted average of binary
                cross entropy with logits. This could be used together with federated loss
            get_fed_loss_cls_weights (Callable): a callable which takes dataset name and frequency
                weight power, and returns the probabilities to sample negative classes for
                federated loss. The implementation can be found in
                detectron2/data/detection_utils.py
            fed_loss_num_classes (int): number of federated classes to keep in total
        """
        super().__init__()
        if isinstance(input_shape, int):  # some backward compatibility
            input_shape = ShapeSpec(channels=input_shape)
        self.num_classes = num_classes
        input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
        # prediction layer for num_classes foreground classes and one background class (hence + 1)
        self.cls_score = nn.Linear(input_size, num_classes + 1)
        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
        box_dim = len(box2box_transform.weights)
        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)

        nn.init.normal_(self.cls_score.weight, std=0.01)
        nn.init.normal_(self.bbox_pred.weight, std=0.001)
        for l in [self.cls_score, self.bbox_pred]:
            nn.init.constant_(l.bias, 0)

        self.box2box_transform = box2box_transform
        self.smooth_l1_beta = smooth_l1_beta
        self.test_score_thresh = test_score_thresh
        self.test_nms_thresh = test_nms_thresh
        self.test_topk_per_image = test_topk_per_image
        self.box_reg_loss_type = box_reg_loss_type
        if isinstance(loss_weight, float):
            loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight}
        self.loss_weight = loss_weight
        self.use_fed_loss = use_fed_loss
        self.use_sigmoid_ce = use_sigmoid_ce
        self.fed_loss_num_classes = fed_loss_num_classes

        if self.use_fed_loss:
            assert self.use_sigmoid_ce, "Please use sigmoid cross entropy loss with federated loss"
            fed_loss_cls_weights = get_fed_loss_cls_weights()
            assert (
                len(fed_loss_cls_weights) == self.num_classes
            ), "Please check the provided fed_loss_cls_weights. Their size should match num_classes"
            self.register_buffer("fed_loss_cls_weights", fed_loss_cls_weights)

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {
            "input_shape": input_shape,
            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS),
            # fmt: off
            "num_classes"              : cfg.MODEL.ROI_HEADS.NUM_CLASSES,
            "cls_agnostic_bbox_reg"    : cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,
            "smooth_l1_beta"           : cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA,
            "test_score_thresh"        : cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
            "test_nms_thresh"          : cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
            "test_topk_per_image"      : cfg.TEST.DETECTIONS_PER_IMAGE,
            "box_reg_loss_type"        : cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE,
            "loss_weight"              : {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT},  # noqa
            "use_fed_loss"             : cfg.MODEL.ROI_BOX_HEAD.USE_FED_LOSS,
            "use_sigmoid_ce"           : cfg.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE,
            "get_fed_loss_cls_weights" : lambda: get_fed_loss_cls_weights(dataset_names=cfg.DATASETS.TRAIN, freq_weight_power=cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER),  # noqa
            "fed_loss_num_classes"     : cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES,
            # fmt: on
        }

    def forward(self, x):
        """
        Args:
            x: per-region features of shape (N, ...) for N bounding boxes to predict.

        Returns:
            (Tensor, Tensor):
            First tensor: shape (N, K+1), scores for each of the N boxes. Each row contains the
            scores for K object categories and 1 background class.

            Second tensor: bounding box regression deltas for each box. Shape is (N, Kx4),
            or (N, 4) for class-agnostic regression.
        """
        if x.dim() > 2:
            x = torch.flatten(x, start_dim=1)
        scores = self.cls_score(x)
        proposal_deltas = self.bbox_pred(x)
        return scores, proposal_deltas

    def losses(self, predictions, proposals):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were used
                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
                ``gt_classes`` are expected.

        Returns:
            Dict[str, Tensor]: dict of losses
        """
        scores, proposal_deltas = predictions

        # parse classification outputs
        gt_classes = (
            cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)
        )
        _log_classification_stats(scores, gt_classes)

        # parse box regression outputs
        if len(proposals):
            proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
            # If "gt_boxes" does not exist, the proposals must be all negative and
            # should not be included in regression loss computation.
            # Here we just use proposal_boxes as an arbitrary placeholder because its
            # value won't be used in self.box_reg_loss().
            gt_boxes = cat(
                [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
                dim=0,
            )
        else:
            proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)

        if self.use_sigmoid_ce:
            loss_cls = self.sigmoid_cross_entropy_loss(scores, gt_classes)
        else:
            loss_cls = cross_entropy(scores, gt_classes, reduction="mean")

        losses = {
            "loss_cls": loss_cls,
            "loss_box_reg": self.box_reg_loss(
                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
            ),
        }
        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}

    # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/fed_loss.py  # noqa
    # with slight modifications
    def get_fed_loss_classes(self, gt_classes, num_fed_loss_classes, num_classes, weight):
        """
        Args:
            gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
            num_fed_loss_classes: minimum number of classes to keep when calculating federated loss.
                Will sample negative classes if number of unique gt_classes is smaller than this value.
            num_classes: number of foreground classes
            weight: probabilities used to sample negative classes

        Returns:
            Tensor:
                classes to keep when calculating the federated loss, including both unique gt
                classes and sampled negative classes.
        """
        unique_gt_classes = torch.unique(gt_classes)
        prob = unique_gt_classes.new_ones(num_classes + 1).float()
        prob[-1] = 0
        if len(unique_gt_classes) < num_fed_loss_classes:
            prob[:num_classes] = weight.float().clone()
            prob[unique_gt_classes] = 0
            sampled_negative_classes = torch.multinomial(
                prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False
            )
            fed_loss_classes = torch.cat([unique_gt_classes, sampled_negative_classes])
        else:
            fed_loss_classes = unique_gt_classes
        return fed_loss_classes

    # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py#L113  # noqa
    # with slight modifications
    def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
        """
        Args:
            pred_class_logits: shape (N, K+1), scores for each of the N boxes. Each row contains
                the scores for K object categories and 1 background class
            gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
        """
        if pred_class_logits.numel() == 0:
            return pred_class_logits.new_zeros([1])[0]

        N = pred_class_logits.shape[0]
        K = pred_class_logits.shape[1] - 1

        target = pred_class_logits.new_zeros(N, K + 1)
        target[range(len(gt_classes)), gt_classes] = 1
        target = target[:, :K]

        cls_loss = F.binary_cross_entropy_with_logits(
            pred_class_logits[:, :-1], target, reduction="none"
        )

        if self.use_fed_loss:
            fed_loss_classes = self.get_fed_loss_classes(
                gt_classes,
                num_fed_loss_classes=self.fed_loss_num_classes,
                num_classes=K,
                weight=self.fed_loss_cls_weights,
            )
            fed_loss_classes_mask = fed_loss_classes.new_zeros(K + 1)
            fed_loss_classes_mask[fed_loss_classes] = 1
            fed_loss_classes_mask = fed_loss_classes_mask[:K]
            weight = fed_loss_classes_mask.view(1, K).expand(N, K).float()
        else:
            weight = 1

        loss = torch.sum(cls_loss * weight) / N
        return loss

    def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
        """
        Args:
            proposal_boxes/gt_boxes are tensors with the same shape (R, 4 or 5).
            pred_deltas has shape (R, 4 or 5), or (R, num_classes * (4 or 5)).
            gt_classes is a long tensor of shape R, the gt class label of each proposal.
            R shall be the number of proposals.
        """
        box_dim = proposal_boxes.shape[1]  # 4 or 5
        # Regression loss is only computed for foreground proposals (those matched to a GT)
        fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
        if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
            fg_pred_deltas = pred_deltas[fg_inds]
        else:
            fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
                fg_inds, gt_classes[fg_inds]
            ]

        loss_box_reg = _dense_box_regression_loss(
            [proposal_boxes[fg_inds]],
            self.box2box_transform,
            [fg_pred_deltas.unsqueeze(0)],
            [gt_boxes[fg_inds]],
            ...,
            self.box_reg_loss_type,
            self.smooth_l1_beta,
        )

        # The reg loss is normalized using the total number of regions (R), not the number
        # of foreground regions even though the box regression loss is only defined on
        # foreground regions. Why? Because doing so gives equal training influence to
        # each foreground example. To see how, consider two different minibatches:
        #  (1) Contains a single foreground region
        #  (2) Contains 100 foreground regions
        # If we normalize by the number of foreground regions, the single example in
        # minibatch (1) will be given 100 times as much influence as each foreground
        # example in minibatch (2). Normalizing by the total number of regions, R,
        # means that the single example in minibatch (1) and each of the 100 examples
        # in minibatch (2) are given equal influence.
        return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty

    def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were
                used to compute predictions. The ``proposal_boxes`` field is expected.

        Returns:
            list[Instances]: same as `fast_rcnn_inference`.
            list[Tensor]: same as `fast_rcnn_inference`.
        """
        boxes = self.predict_boxes(predictions, proposals)
        scores = self.predict_probs(predictions, proposals)
        image_shapes = [x.image_size for x in proposals]
        return fast_rcnn_inference(
            boxes,
            scores,
            image_shapes,
            self.test_score_thresh,
            self.test_nms_thresh,
            self.test_topk_per_image,
        )

    def predict_boxes_for_gt_classes(self, predictions, proposals):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were used
                to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected.

        Returns:
            list[Tensor]:
                A list of Tensors of predicted boxes for GT classes in case of
                class-specific box head. Element i of the list has shape (Ri, B), where Ri is
                the number of proposals for image i and B is the box dimension (4 or 5)
        """
        if not len(proposals):
            return []
        scores, proposal_deltas = predictions
        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
        N, B = proposal_boxes.shape
        predict_boxes = self.box2box_transform.apply_deltas(
            proposal_deltas, proposal_boxes
        )  # Nx(KxB)

        K = predict_boxes.shape[1] // B
        if K > 1:
            gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
            # Some proposals are ignored or have a background class. Their gt_classes
            # cannot be used as index.
            gt_classes = gt_classes.clamp_(0, K - 1)

            predict_boxes = predict_boxes.view(N, K, B)[
                torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
            ]
        num_prop_per_image = [len(p) for p in proposals]
        return predict_boxes.split(num_prop_per_image)

    def predict_boxes(
        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
    ):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were
                used to compute predictions. The ``proposal_boxes`` field is expected.

        Returns:
            list[Tensor]:
                A list of Tensors of predicted class-specific or class-agnostic boxes
                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
                the number of proposals for image i and B is the box dimension (4 or 5)
        """
        if not len(proposals):
            return []
        _, proposal_deltas = predictions
        num_prop_per_image = [len(p) for p in proposals]
        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
        predict_boxes = self.box2box_transform.apply_deltas(
            proposal_deltas,
            proposal_boxes,
        )  # Nx(KxB)
        return predict_boxes.split(num_prop_per_image)

    def predict_probs(
        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
    ):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were
                used to compute predictions.

        Returns:
            list[Tensor]:
                A list of Tensors of predicted class probabilities for each image.
                Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
        """
        scores, _ = predictions
        num_inst_per_image = [len(p) for p in proposals]
        if self.use_sigmoid_ce:
            probs = scores.sigmoid()
        else:
            probs = F.softmax(scores, dim=-1)
        return probs.split(num_inst_per_image, dim=0)
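To make the federated-loss machinery above concrete, here is a standalone sketch (not part of the diff) that mirrors the logic of `get_fed_loss_classes` with plain PyTorch and toy values: the unique ground-truth classes are always kept, and extra negative classes are sampled with the provided per-class weights until `num_fed_loss_classes` classes are selected.

import torch

num_classes = 10                             # foreground classes (toy value)
gt_classes = torch.tensor([2, 2, 5, 10])     # 10 is the background label (= num_classes)
weight = torch.ones(num_classes)             # sampling probabilities for negative classes
num_fed_loss_classes = 4

unique_gt = torch.unique(gt_classes)
prob = unique_gt.new_ones(num_classes + 1).float()
prob[-1] = 0                                 # never sample the background slot
if len(unique_gt) < num_fed_loss_classes:
    prob[:num_classes] = weight.float().clone()
    prob[unique_gt] = 0                      # don't re-sample classes already present
    sampled = torch.multinomial(prob, num_fed_loss_classes - len(unique_gt), replacement=False)
    fed_loss_classes = torch.cat([unique_gt, sampled])
else:
    fed_loss_classes = unique_gt
print(fed_loss_classes)  # e.g. tensor([ 2,  5, 10,  7]) -- the gt classes plus one sampled negative

In `sigmoid_cross_entropy_loss`, the resulting class set is turned into a 0/1 mask over the K foreground columns, so only those columns contribute to the binary cross-entropy.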
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/keypoint_head.py
ADDED
@@ -0,0 +1,272 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
from typing import List
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
from torch.nn import functional as F
|
6 |
+
|
7 |
+
from annotator.oneformer.detectron2.config import configurable
|
8 |
+
from annotator.oneformer.detectron2.layers import Conv2d, ConvTranspose2d, cat, interpolate
|
9 |
+
from annotator.oneformer.detectron2.structures import Instances, heatmaps_to_keypoints
|
10 |
+
from annotator.oneformer.detectron2.utils.events import get_event_storage
|
11 |
+
from annotator.oneformer.detectron2.utils.registry import Registry
|
12 |
+
|
13 |
+
_TOTAL_SKIPPED = 0
|
14 |
+
|
15 |
+
|
16 |
+
__all__ = [
|
17 |
+
"ROI_KEYPOINT_HEAD_REGISTRY",
|
18 |
+
"build_keypoint_head",
|
19 |
+
"BaseKeypointRCNNHead",
|
20 |
+
"KRCNNConvDeconvUpsampleHead",
|
21 |
+
]
|
22 |
+
|
23 |
+
|
24 |
+
ROI_KEYPOINT_HEAD_REGISTRY = Registry("ROI_KEYPOINT_HEAD")
|
25 |
+
ROI_KEYPOINT_HEAD_REGISTRY.__doc__ = """
|
26 |
+
Registry for keypoint heads, which make keypoint predictions from per-region features.
|
27 |
+
|
28 |
+
The registered object will be called with `obj(cfg, input_shape)`.
|
29 |
+
"""
|
30 |
+
|
31 |
+
|
32 |
+
def build_keypoint_head(cfg, input_shape):
|
33 |
+
"""
|
34 |
+
Build a keypoint head from `cfg.MODEL.ROI_KEYPOINT_HEAD.NAME`.
|
35 |
+
"""
|
36 |
+
name = cfg.MODEL.ROI_KEYPOINT_HEAD.NAME
|
37 |
+
return ROI_KEYPOINT_HEAD_REGISTRY.get(name)(cfg, input_shape)


def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer):
    """
    Arguments:
        pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
            of instances in the batch, K is the number of keypoints, and S is the side length
            of the keypoint heatmap. The values are spatial logits.
        instances (list[Instances]): A list of M Instances, where M is the batch size.
            These instances are predictions from the model
            that are in 1:1 correspondence with pred_keypoint_logits.
            Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoints`
            instance.
        normalizer (float): Normalize the loss by this amount.
            If not specified, we normalize by the number of visible keypoints in the minibatch.

    Returns a scalar tensor containing the loss.
    """
    heatmaps = []
    valid = []

    keypoint_side_len = pred_keypoint_logits.shape[2]
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        keypoints = instances_per_image.gt_keypoints
        heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
            instances_per_image.proposal_boxes.tensor, keypoint_side_len
        )
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    if len(heatmaps):
        keypoint_targets = cat(heatmaps, dim=0)
        valid = cat(valid, dim=0).to(dtype=torch.uint8)
        valid = torch.nonzero(valid).squeeze(1)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if len(heatmaps) == 0 or valid.numel() == 0:
        global _TOTAL_SKIPPED
        _TOTAL_SKIPPED += 1
        storage = get_event_storage()
        storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False)
        return pred_keypoint_logits.sum() * 0

    N, K, H, W = pred_keypoint_logits.shape
    pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(
        pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum"
    )

    # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
    if normalizer is None:
        normalizer = valid.numel()
    keypoint_loss /= normalizer

    return keypoint_loss
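# Shape sketch for the loss above (illustrative values: N=2 instances, K=17 keypoints, S=56):
#
#   logits = torch.randn(2, 17, 56, 56).view(2 * 17, 56 * 56)   # (34, 3136)
#   targets = torch.randint(0, 56 * 56, (34,))                  # flat heatmap cell index per keypoint
#   loss = F.cross_entropy(logits, targets, reduction="sum") / 34
#
# i.e. each visible keypoint is supervised as an (S*S)-way classification over heatmap cells.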


def keypoint_rcnn_inference(pred_keypoint_logits: torch.Tensor, pred_instances: List[Instances]):
    """
    Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score)
    and add it to the `pred_instances` as a `pred_keypoints` field.

    Args:
        pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number
            of instances in the batch, K is the number of keypoints, and S is the side length of
            the keypoint heatmap. The values are spatial logits.
        pred_instances (list[Instances]): A list of N Instances, where N is the number of images.

    Returns:
        None. Each element in pred_instances will contain extra "pred_keypoints" and
            "pred_keypoint_heatmaps" fields. "pred_keypoints" is a tensor of shape
            (#instance, K, 3) where the last dimension corresponds to (x, y, score).
            The scores are larger than 0. "pred_keypoint_heatmaps" contains the raw
            keypoint logits as passed to this function.
    """
    # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor)
    bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0)

    pred_keypoint_logits = pred_keypoint_logits.detach()
    keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits, bboxes_flat.detach())
    num_instances_per_image = [len(i) for i in pred_instances]
    keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0)
    heatmap_results = pred_keypoint_logits.split(num_instances_per_image, dim=0)

    for keypoint_results_per_image, heatmap_results_per_image, instances_per_image in zip(
        keypoint_results, heatmap_results, pred_instances
    ):
        # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score)
        # heatmap_results_per_image is (num instances)x(num keypoints)x(side)x(side)
        instances_per_image.pred_keypoints = keypoint_results_per_image
        instances_per_image.pred_keypoint_heatmaps = heatmap_results_per_image
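# Consumption sketch (illustrative): after calling keypoint_rcnn_inference(logits, instances),
#
#   kpts = instances[0].pred_keypoints          # (num_instances, K, 3)
#   xy, scores = kpts[..., :2], kpts[..., 2]    # heatmap-derived coordinates and scores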


class BaseKeypointRCNNHead(nn.Module):
    """
    Implement the basic Keypoint R-CNN losses and inference logic described in
    Sec. 5 of :paper:`Mask R-CNN`.
    """

    @configurable
    def __init__(self, *, num_keypoints, loss_weight=1.0, loss_normalizer=1.0):
        """
        NOTE: this interface is experimental.

        Args:
            num_keypoints (int): number of keypoints to predict
            loss_weight (float): weight to multiply the keypoint loss by
            loss_normalizer (float or str):
                If float, divide the loss by `loss_normalizer * #images`.
                If 'visible', the loss is normalized by the total number of
                visible keypoints across images.
        """
        super().__init__()
        self.num_keypoints = num_keypoints
        self.loss_weight = loss_weight
        assert loss_normalizer == "visible" or isinstance(loss_normalizer, float), loss_normalizer
        self.loss_normalizer = loss_normalizer

    @classmethod
    def from_config(cls, cfg, input_shape):
        ret = {
            "loss_weight": cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT,
            "num_keypoints": cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS,
        }
        normalize_by_visible = (
            cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS
        )  # noqa
        if not normalize_by_visible:
            batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
            positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
            ret["loss_normalizer"] = (
                ret["num_keypoints"] * batch_size_per_image * positive_sample_fraction
            )
        else:
            ret["loss_normalizer"] = "visible"
        return ret

    def forward(self, x, instances: List[Instances]):
        """
        Args:
            x: input 4D region feature(s) provided by :class:`ROIHeads`.
            instances (list[Instances]): contains the boxes & labels corresponding
                to the input features.
                Exact format is up to its caller to decide.
                Typically, this is the foreground instances in training, with
                "proposal_boxes" field and other gt annotations.
                In inference, it contains boxes that are already predicted.

        Returns:
            A dict of losses if in training. The predicted "instances" if in inference.
        """
        x = self.layers(x)
        if self.training:
            num_images = len(instances)
            normalizer = (
                None if self.loss_normalizer == "visible" else num_images * self.loss_normalizer
            )
            return {
                "loss_keypoint": keypoint_rcnn_loss(x, instances, normalizer=normalizer)
                * self.loss_weight
            }
        else:
            keypoint_rcnn_inference(x, instances)
            return instances

    def layers(self, x):
        """
        Neural network layers that make predictions from regional input features.
        """
        raise NotImplementedError


# To get torchscript support, we make the head a subclass of `nn.Sequential`.
# Therefore, to add new layers in this head class, please make sure they are
# added in the order they will be used in forward().
@ROI_KEYPOINT_HEAD_REGISTRY.register()
class KRCNNConvDeconvUpsampleHead(BaseKeypointRCNNHead, nn.Sequential):
    """
    A standard keypoint head containing a series of 3x3 convs, followed by
    a transpose convolution and bilinear interpolation for upsampling.
    It is described in Sec. 5 of :paper:`Mask R-CNN`.
    """

    @configurable
    def __init__(self, input_shape, *, num_keypoints, conv_dims, **kwargs):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape (ShapeSpec): shape of the input feature
            conv_dims: an iterable of output channel counts for each conv in the head
                e.g. (512, 512, 512) for three convs outputting 512 channels.
        """
        super().__init__(num_keypoints=num_keypoints, **kwargs)

        # default up_scale to 2.0 (this can be made an option)
        up_scale = 2.0
        in_channels = input_shape.channels

        for idx, layer_channels in enumerate(conv_dims, 1):
            module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1)
            self.add_module("conv_fcn{}".format(idx), module)
            self.add_module("conv_fcn_relu{}".format(idx), nn.ReLU())
            in_channels = layer_channels

        deconv_kernel = 4
        self.score_lowres = ConvTranspose2d(
            in_channels, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1
        )
        self.up_scale = up_scale

        for name, param in self.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0)
            elif "weight" in name:
                # Caffe2 implementation uses MSRAFill, which in fact
                # corresponds to kaiming_normal_ in PyTorch
                nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")

    @classmethod
    def from_config(cls, cfg, input_shape):
        ret = super().from_config(cfg, input_shape)
        ret["input_shape"] = input_shape
        ret["conv_dims"] = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS
        return ret

    def layers(self, x):
        for layer in self:
            x = layer(x)
        x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False)
        return x
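A minimal usage sketch for the keypoint head above, assuming direct construction with illustrative shapes and channel counts (these are assumptions, not the project's config defaults):

import torch

from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.keypoint_head import (
    KRCNNConvDeconvUpsampleHead,
)

head = KRCNNConvDeconvUpsampleHead(
    ShapeSpec(channels=256, height=14, width=14),  # pooled ROI feature shape (assumed)
    num_keypoints=17,
    conv_dims=[512] * 8,
)
head.eval()
regions = torch.randn(4, 256, 14, 14)  # 4 pooled regions
heatmaps = head.layers(regions)        # (4, 17, 56, 56): convs -> 2x deconv -> 2x interpolate

The 3x3 convs preserve the 14x14 input, the transpose conv doubles it to 28x28, and the bilinear interpolation doubles it again, which is where the 56x56 heatmap side comes from.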
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/mask_head.py
ADDED
@@ -0,0 +1,298 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from typing import List
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F

from annotator.oneformer.detectron2.config import configurable
from annotator.oneformer.detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, get_norm
from annotator.oneformer.detectron2.layers.wrappers import move_device_like
from annotator.oneformer.detectron2.structures import Instances
from annotator.oneformer.detectron2.utils.events import get_event_storage
from annotator.oneformer.detectron2.utils.registry import Registry

__all__ = [
    "BaseMaskRCNNHead",
    "MaskRCNNConvUpsampleHead",
    "build_mask_head",
    "ROI_MASK_HEAD_REGISTRY",
]


ROI_MASK_HEAD_REGISTRY = Registry("ROI_MASK_HEAD")
ROI_MASK_HEAD_REGISTRY.__doc__ = """
Registry for mask heads, which predicts instance masks given
per-region features.

The registered object will be called with `obj(cfg, input_shape)`.
"""


@torch.jit.unused
def mask_rcnn_loss(pred_mask_logits: torch.Tensor, instances: List[Instances], vis_period: int = 0):
    """
    Compute the mask prediction loss defined in the Mask R-CNN paper.

    Args:
        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
            for class-specific or class-agnostic, where B is the total number of predicted masks
            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
            and width of the mask predictions. The values are logits.
        instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. These instances are in 1:1
            correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask,
            ...) associated with each instance are stored in fields.
        vis_period (int): the period (in steps) to dump visualization.

    Returns:
        mask_loss (Tensor): A scalar tensor containing the loss.
    """
    cls_agnostic_mask = pred_mask_logits.size(1) == 1
    total_num_masks = pred_mask_logits.size(0)
    mask_side_len = pred_mask_logits.size(2)
    assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!"

    gt_classes = []
    gt_masks = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        if not cls_agnostic_mask:
            gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
            gt_classes.append(gt_classes_per_image)

        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
            instances_per_image.proposal_boxes.tensor, mask_side_len
        ).to(device=pred_mask_logits.device)
        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
        gt_masks.append(gt_masks_per_image)

    if len(gt_masks) == 0:
        return pred_mask_logits.sum() * 0

    gt_masks = cat(gt_masks, dim=0)

    if cls_agnostic_mask:
        pred_mask_logits = pred_mask_logits[:, 0]
    else:
        indices = torch.arange(total_num_masks)
        gt_classes = cat(gt_classes, dim=0)
        pred_mask_logits = pred_mask_logits[indices, gt_classes]

    if gt_masks.dtype == torch.bool:
        gt_masks_bool = gt_masks
    else:
        # Here we allow gt_masks to be float as well (depending on the implementation of rasterize())
        gt_masks_bool = gt_masks > 0.5
    gt_masks = gt_masks.to(dtype=torch.float32)

    # Log the training accuracy (using gt classes and 0.5 threshold)
    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
    mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0))
    num_positive = gt_masks_bool.sum().item()
    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
        gt_masks_bool.numel() - num_positive, 1.0
    )
    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0)

    storage = get_event_storage()
    storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
    storage.put_scalar("mask_rcnn/false_positive", false_positive)
    storage.put_scalar("mask_rcnn/false_negative", false_negative)
    if vis_period > 0 and storage.iter % vis_period == 0:
        pred_masks = pred_mask_logits.sigmoid()
        vis_masks = torch.cat([pred_masks, gt_masks], axis=2)
        name = "Left: mask prediction; Right: mask GT"
        for idx, vis_mask in enumerate(vis_masks):
            vis_mask = torch.stack([vis_mask] * 3, axis=0)
            storage.put_image(name + f" ({idx})", vis_mask)

    mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits, gt_masks, reduction="mean")
    return mask_loss
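# Shape sketch for the class-specific branch above (illustrative: B=3 masks, C=80 classes, 28x28):
#
#   logits = torch.randn(3, 80, 28, 28)
#   gt_classes = torch.tensor([5, 0, 72])
#   per_instance = logits[torch.arange(3), gt_classes]          # (3, 28, 28), one mask per instance
#   gt = (torch.rand(3, 28, 28) > 0.5).float()                  # stand-in ground-truth masks
#   loss = F.binary_cross_entropy_with_logits(per_instance, gt, reduction="mean")
#
# i.e. only the mask channel of each instance's gt class receives a binary cross-entropy loss.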


def mask_rcnn_inference(pred_mask_logits: torch.Tensor, pred_instances: List[Instances]):
    """
    Convert pred_mask_logits to estimated foreground probability masks while also
    extracting only the masks for the predicted classes in pred_instances. For each
    predicted box, the mask of the same class is attached to the instance by adding a
    new "pred_masks" field to pred_instances.

    Args:
        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
            for class-specific or class-agnostic, where B is the total number of predicted masks
            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
            and width of the mask predictions. The values are logits.
        pred_instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. Each Instances must have field "pred_classes".

    Returns:
        None. pred_instances will contain an extra "pred_masks" field storing a mask of size (Hmask,
            Wmask) for the predicted class. Note that the masks are returned as soft (non-quantized)
            masks at the resolution predicted by the network; post-processing steps, such as resizing
            the predicted masks to the original image resolution and/or binarizing them, are left
            to the caller.
    """
    cls_agnostic_mask = pred_mask_logits.size(1) == 1

    if cls_agnostic_mask:
        mask_probs_pred = pred_mask_logits.sigmoid()
    else:
        # Select masks corresponding to the predicted classes
        num_masks = pred_mask_logits.shape[0]
        class_pred = cat([i.pred_classes for i in pred_instances])
        device = (
            class_pred.device
            if torch.jit.is_scripting()
            else ("cpu" if torch.jit.is_tracing() else class_pred.device)
        )
        indices = move_device_like(torch.arange(num_masks, device=device), class_pred)
        mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid()
    # mask_probs_pred.shape: (B, 1, Hmask, Wmask)

    num_boxes_per_image = [len(i) for i in pred_instances]
    mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0)

    for prob, instances in zip(mask_probs_pred, pred_instances):
        instances.pred_masks = prob  # (1, Hmask, Wmask)
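# Consumption sketch (illustrative): after calling mask_rcnn_inference(mask_logits, instances),
#
#   probs = instances[0].pred_masks             # (num_instances, 1, Hmask, Wmask), values in [0, 1]
#   binary = probs > 0.5                        # binarization/resizing is left to the caller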


class BaseMaskRCNNHead(nn.Module):
    """
    Implement the basic Mask R-CNN losses and inference logic described in :paper:`Mask R-CNN`
    """

    @configurable
    def __init__(self, *, loss_weight: float = 1.0, vis_period: int = 0):
        """
        NOTE: this interface is experimental.

        Args:
            loss_weight (float): multiplier of the loss
            vis_period (int): visualization period
        """
        super().__init__()
        self.vis_period = vis_period
        self.loss_weight = loss_weight

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {"vis_period": cfg.VIS_PERIOD}

    def forward(self, x, instances: List[Instances]):
        """
        Args:
            x: input region feature(s) provided by :class:`ROIHeads`.
            instances (list[Instances]): contains the boxes & labels corresponding
                to the input features.
                Exact format is up to its caller to decide.
                Typically, this is the foreground instances in training, with
                "proposal_boxes" field and other gt annotations.
                In inference, it contains boxes that are already predicted.

        Returns:
            A dict of losses in training. The predicted "instances" in inference.
        """
        x = self.layers(x)
        if self.training:
            return {"loss_mask": mask_rcnn_loss(x, instances, self.vis_period) * self.loss_weight}
        else:
            mask_rcnn_inference(x, instances)
            return instances

    def layers(self, x):
        """
        Neural network layers that make predictions from input features.
        """
        raise NotImplementedError


# To get torchscript support, we make the head a subclass of `nn.Sequential`.
# Therefore, to add new layers in this head class, please make sure they are
# added in the order they will be used in forward().
@ROI_MASK_HEAD_REGISTRY.register()
class MaskRCNNConvUpsampleHead(BaseMaskRCNNHead, nn.Sequential):
    """
    A mask head with several conv layers, plus an upsample layer (with `ConvTranspose2d`).
    Predictions are made with a final 1x1 conv layer.
    """

    @configurable
    def __init__(self, input_shape: ShapeSpec, *, num_classes, conv_dims, conv_norm="", **kwargs):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape (ShapeSpec): shape of the input feature
            num_classes (int): the number of foreground classes (i.e. background is not
                included). 1 if using class agnostic prediction.
            conv_dims (list[int]): a list of N>0 integers representing the output dimensions
                of N-1 conv layers and the last upsample layer.
            conv_norm (str or callable): normalization for the conv layers.
                See :func:`detectron2.layers.get_norm` for supported types.
        """
        super().__init__(**kwargs)
        assert len(conv_dims) >= 1, "conv_dims have to be non-empty!"

        self.conv_norm_relus = []

        cur_channels = input_shape.channels
        for k, conv_dim in enumerate(conv_dims[:-1]):
            conv = Conv2d(
                cur_channels,
                conv_dim,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=not conv_norm,
                norm=get_norm(conv_norm, conv_dim),
                activation=nn.ReLU(),
            )
            self.add_module("mask_fcn{}".format(k + 1), conv)
            self.conv_norm_relus.append(conv)
            cur_channels = conv_dim

        self.deconv = ConvTranspose2d(
            cur_channels, conv_dims[-1], kernel_size=2, stride=2, padding=0
        )
        self.add_module("deconv_relu", nn.ReLU())
        cur_channels = conv_dims[-1]

        self.predictor = Conv2d(cur_channels, num_classes, kernel_size=1, stride=1, padding=0)

        for layer in self.conv_norm_relus + [self.deconv]:
            weight_init.c2_msra_fill(layer)
        # use normal distribution initialization for mask prediction layer
        nn.init.normal_(self.predictor.weight, std=0.001)
        if self.predictor.bias is not None:
            nn.init.constant_(self.predictor.bias, 0)

    @classmethod
    def from_config(cls, cfg, input_shape):
        ret = super().from_config(cfg, input_shape)
        conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
        num_conv = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV
        ret.update(
            conv_dims=[conv_dim] * (num_conv + 1),  # +1 for ConvTranspose
            conv_norm=cfg.MODEL.ROI_MASK_HEAD.NORM,
            input_shape=input_shape,
        )
        if cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK:
            ret["num_classes"] = 1
        else:
            ret["num_classes"] = cfg.MODEL.ROI_HEADS.NUM_CLASSES
        return ret

    def layers(self, x):
        for layer in self:
            x = layer(x)
        return x


def build_mask_head(cfg, input_shape):
    """
    Build a mask head defined by `cfg.MODEL.ROI_MASK_HEAD.NAME`.
    """
    name = cfg.MODEL.ROI_MASK_HEAD.NAME
    return ROI_MASK_HEAD_REGISTRY.get(name)(cfg, input_shape)
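A minimal usage sketch for the mask head above, assuming direct construction with illustrative shapes and channel counts (these are assumptions, not the project's config defaults):

import torch

from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.mask_head import MaskRCNNConvUpsampleHead

mask_head = MaskRCNNConvUpsampleHead(
    ShapeSpec(channels=256, height=14, width=14),  # pooled ROI feature shape (assumed)
    num_classes=80,
    conv_dims=[256, 256, 256, 256, 256],  # four 3x3 convs + one deconv dim
    conv_norm="",
)
mask_head.eval()
regions = torch.randn(8, 256, 14, 14)  # 8 pooled regions
logits = mask_head.layers(regions)     # (8, 80, 28, 28): the stride-2 deconv doubles the spatial size

At inference time these logits would go through mask_rcnn_inference, which selects the channel of each instance's predicted class and stores the resulting soft masks on the Instances objects.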