5c92d4965b9c0b7f421c923eaf90c134e60fd25cc660369d2b79e6de7c3e2f37
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py +50 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py +46 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py +44 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py +46 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py +47 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py +48 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fast_scnn.py +57 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_hr18.py +52 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py +45 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py +51 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fpn_r50.py +36 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py +35 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py +46 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py +25 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py +46 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py +68 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py +47 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pointrend_r50.py +56 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py +49 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py +44 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py +50 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/upernet_r50.py +44 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py +43 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py +9 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py +9 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py +9 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py +9 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/inference.py +144 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/mmcv_custom/__init__.py +5 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/mmcv_custom/checkpoint.py +508 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/uniformer.py +426 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/upernet_global_small.py +44 -0
- extensions/microsoftexcel-controlnet/annotator/util.py +79 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/LICENSE +21 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/__init__.py +59 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/__init__.py +24 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/__init__.py +24 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas.py +379 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/.gitignore +110 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/Dockerfile +29 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/LICENSE +21 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/README.md +259 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/environment.yaml +16 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/hubconf.py +435 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/input/.placeholder +0 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/beit.py +196 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/levit.py +106 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/next_vit.py +39 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin.py +13 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin2.py +34 -0
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py
ADDED
@@ -0,0 +1,50 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained=None,
    backbone=dict(
        type='UNet',
        in_channels=3,
        base_channels=64,
        num_stages=5,
        strides=(1, 1, 1, 1, 1),
        enc_num_convs=(2, 2, 2, 2, 2),
        dec_num_convs=(2, 2, 2, 2),
        downsamples=(True, True, True, True),
        enc_dilations=(1, 1, 1, 1, 1),
        dec_dilations=(1, 1, 1, 1),
        with_cp=False,
        conv_cfg=None,
        norm_cfg=norm_cfg,
        act_cfg=dict(type='ReLU'),
        upsample_cfg=dict(type='InterpConv'),
        norm_eval=False),
    decode_head=dict(
        type='ASPPHead',
        in_channels=64,
        in_index=4,
        channels=16,
        dilations=(1, 12, 24, 36),
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=128,
        in_index=3,
        channels=64,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='slide', crop_size=256, stride=170))
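These `_base_/models` files are building blocks rather than complete training configs; in mmseg-style setups a top-level config normally pulls one in through `_base_` (together with a dataset and schedule base) and overrides individual fields. A minimal sketch of that composition, assuming the usual mmsegmentation config-inheritance mechanism; the downstream file name and overrides below are hypothetical and not part of this commit:

# hypothetical downstream config (not part of this commit) showing _base_ composition
_base_ = [
    '../_base_/models/deeplabv3_unet_s5-d16.py',
    '../_base_/schedules/schedule_40k.py',
]
# inherited fields can then be overridden selectively, e.g. the number of classes
model = dict(
    decode_head=dict(num_classes=19),
    auxiliary_head=dict(num_classes=19))
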
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py
ADDED
@@ -0,0 +1,46 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='DepthwiseSeparableASPPHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        dilations=(1, 12, 24, 36),
        c1_in_channels=256,
        c1_channels=48,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py
ADDED
@@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='DMHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        filter_sizes=(1, 3, 5, 7),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py
ADDED
@@ -0,0 +1,46 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='DNLHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        dropout_ratio=0.1,
        reduction=2,
        use_scale=True,
        mode='embedded_gaussian',
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py
ADDED
@@ -0,0 +1,47 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='EMAHead',
        in_channels=2048,
        in_index=3,
        channels=256,
        ema_channels=512,
        num_bases=64,
        num_stages=3,
        momentum=0.1,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py
ADDED
@@ -0,0 +1,48 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='EncHead',
        in_channels=[512, 1024, 2048],
        in_index=(1, 2, 3),
        channels=512,
        num_codes=32,
        use_se_loss=True,
        add_lateral=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        loss_se_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fast_scnn.py
ADDED
@@ -0,0 +1,57 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='FastSCNN',
        downsample_dw_channels=(32, 48),
        global_in_channels=64,
        global_block_channels=(64, 96, 128),
        global_block_strides=(2, 2, 1),
        global_out_channels=128,
        higher_in_channels=64,
        lower_in_channels=128,
        fusion_out_channels=128,
        out_indices=(0, 1, 2),
        norm_cfg=norm_cfg,
        align_corners=False),
    decode_head=dict(
        type='DepthwiseSeparableFCNHead',
        in_channels=128,
        channels=128,
        concat_input=False,
        num_classes=19,
        in_index=-1,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
    auxiliary_head=[
        dict(
            type='FCNHead',
            in_channels=128,
            channels=32,
            num_convs=1,
            num_classes=19,
            in_index=-2,
            norm_cfg=norm_cfg,
            concat_input=False,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
        dict(
            type='FCNHead',
            in_channels=64,
            channels=32,
            num_convs=1,
            num_classes=19,
            in_index=-3,
            norm_cfg=norm_cfg,
            concat_input=False,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
    ],
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_hr18.py
ADDED
@@ -0,0 +1,52 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://msra/hrnetv2_w18',
    backbone=dict(
        type='HRNet',
        norm_cfg=norm_cfg,
        norm_eval=False,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, )),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(18, 36)),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(18, 36, 72)),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(18, 36, 72, 144)))),
    decode_head=dict(
        type='FCNHead',
        in_channels=[18, 36, 72, 144],
        in_index=(0, 1, 2, 3),
        channels=sum([18, 36, 72, 144]),
        input_transform='resize_concat',
        kernel_size=1,
        num_convs=1,
        concat_input=False,
        dropout_ratio=-1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py
ADDED
@@ -0,0 +1,45 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='FCNHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        num_convs=2,
        concat_input=True,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py
ADDED
@@ -0,0 +1,51 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained=None,
    backbone=dict(
        type='UNet',
        in_channels=3,
        base_channels=64,
        num_stages=5,
        strides=(1, 1, 1, 1, 1),
        enc_num_convs=(2, 2, 2, 2, 2),
        dec_num_convs=(2, 2, 2, 2),
        downsamples=(True, True, True, True),
        enc_dilations=(1, 1, 1, 1, 1),
        dec_dilations=(1, 1, 1, 1),
        with_cp=False,
        conv_cfg=None,
        norm_cfg=norm_cfg,
        act_cfg=dict(type='ReLU'),
        upsample_cfg=dict(type='InterpConv'),
        norm_eval=False),
    decode_head=dict(
        type='FCNHead',
        in_channels=64,
        in_index=4,
        channels=64,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=128,
        in_index=3,
        channels=64,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='slide', crop_size=256, stride=170))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fpn_r50.py
ADDED
@@ -0,0 +1,36 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=4),
    decode_head=dict(
        type='FPNHead',
        in_channels=[256, 256, 256, 256],
        in_index=[0, 1, 2, 3],
        feature_strides=[4, 8, 16, 32],
        channels=128,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py
ADDED
@@ -0,0 +1,35 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='UniFormer',
        embed_dim=[64, 128, 320, 512],
        layers=[3, 4, 8, 3],
        head_dim=64,
        mlp_ratio=4.,
        qkv_bias=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.1),
    neck=dict(
        type='FPN',
        in_channels=[64, 128, 320, 512],
        out_channels=256,
        num_outs=4),
    decode_head=dict(
        type='FPNHead',
        in_channels=[256, 256, 256, 256],
        in_index=[0, 1, 2, 3],
        feature_strides=[4, 8, 16, 32],
        channels=128,
        dropout_ratio=0.1,
        num_classes=150,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole')
)
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py
ADDED
@@ -0,0 +1,46 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='GCHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        ratio=1 / 4.,
        pooling_type='att',
        fusion_types=('channel_add', ),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py
ADDED
@@ -0,0 +1,25 @@
# model settings
norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='MobileNetV3',
        arch='large',
        out_indices=(1, 3, 16),
        norm_cfg=norm_cfg),
    decode_head=dict(
        type='LRASPPHead',
        in_channels=(16, 24, 960),
        in_index=(0, 1, 2),
        channels=128,
        input_transform='multiple_select',
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        act_cfg=dict(type='ReLU'),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py
ADDED
@@ -0,0 +1,46 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='NLHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        dropout_ratio=0.1,
        reduction=2,
        use_scale=True,
        mode='embedded_gaussian',
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py
ADDED
@@ -0,0 +1,68 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='CascadeEncoderDecoder',
    num_stages=2,
    pretrained='open-mmlab://msra/hrnetv2_w18',
    backbone=dict(
        type='HRNet',
        norm_cfg=norm_cfg,
        norm_eval=False,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, )),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(18, 36)),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(18, 36, 72)),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(18, 36, 72, 144)))),
    decode_head=[
        dict(
            type='FCNHead',
            in_channels=[18, 36, 72, 144],
            channels=sum([18, 36, 72, 144]),
            in_index=(0, 1, 2, 3),
            input_transform='resize_concat',
            kernel_size=1,
            num_convs=1,
            concat_input=False,
            dropout_ratio=-1,
            num_classes=19,
            norm_cfg=norm_cfg,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='OCRHead',
            in_channels=[18, 36, 72, 144],
            in_index=(0, 1, 2, 3),
            input_transform='resize_concat',
            channels=512,
            ocr_channels=256,
            dropout_ratio=-1,
            num_classes=19,
            norm_cfg=norm_cfg,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    ],
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py
ADDED
@@ -0,0 +1,47 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='CascadeEncoderDecoder',
    num_stages=2,
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=[
        dict(
            type='FCNHead',
            in_channels=1024,
            in_index=2,
            channels=256,
            num_convs=1,
            concat_input=False,
            dropout_ratio=0.1,
            num_classes=19,
            norm_cfg=norm_cfg,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='OCRHead',
            in_channels=2048,
            in_index=3,
            channels=512,
            ocr_channels=256,
            dropout_ratio=0.1,
            num_classes=19,
            norm_cfg=norm_cfg,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
    ],
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pointrend_r50.py
ADDED
@@ -0,0 +1,56 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='CascadeEncoderDecoder',
    num_stages=2,
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=4),
    decode_head=[
        dict(
            type='FPNHead',
            in_channels=[256, 256, 256, 256],
            in_index=[0, 1, 2, 3],
            feature_strides=[4, 8, 16, 32],
            channels=128,
            dropout_ratio=-1,
            num_classes=19,
            norm_cfg=norm_cfg,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
        dict(
            type='PointHead',
            in_channels=[256],
            in_index=[0],
            channels=256,
            num_fcs=3,
            coarse_pred_each_layer=True,
            dropout_ratio=-1,
            num_classes=19,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
    ],
    # model training and testing settings
    train_cfg=dict(
        num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75),
    test_cfg=dict(
        mode='whole',
        subdivision_steps=2,
        subdivision_num_points=8196,
        scale_factor=2))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py
ADDED
@@ -0,0 +1,49 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='PSAHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        mask_size=(97, 97),
        psa_type='bi-direction',
        compact=False,
        shrink_factor=2,
        normalization_factor=1.0,
        psa_softmax=True,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py
ADDED
@@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='PSPHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        pool_scales=(1, 2, 3, 6),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py
ADDED
@@ -0,0 +1,50 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained=None,
    backbone=dict(
        type='UNet',
        in_channels=3,
        base_channels=64,
        num_stages=5,
        strides=(1, 1, 1, 1, 1),
        enc_num_convs=(2, 2, 2, 2, 2),
        dec_num_convs=(2, 2, 2, 2),
        downsamples=(True, True, True, True),
        enc_dilations=(1, 1, 1, 1, 1),
        dec_dilations=(1, 1, 1, 1),
        with_cp=False,
        conv_cfg=None,
        norm_cfg=norm_cfg,
        act_cfg=dict(type='ReLU'),
        upsample_cfg=dict(type='InterpConv'),
        norm_eval=False),
    decode_head=dict(
        type='PSPHead',
        in_channels=64,
        in_index=4,
        channels=16,
        pool_scales=(1, 2, 3, 6),
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=128,
        in_index=3,
        channels=64,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='slide', crop_size=256, stride=170))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/upernet_r50.py
ADDED
@@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='UPerHead',
        in_channels=[256, 512, 1024, 2048],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py
ADDED
@@ -0,0 +1,43 @@
# model settings
norm_cfg = dict(type='BN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained=None,
    backbone=dict(
        type='UniFormer',
        embed_dim=[64, 128, 320, 512],
        layers=[3, 4, 8, 3],
        head_dim=64,
        mlp_ratio=4.,
        qkv_bias=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.1),
    decode_head=dict(
        type='UPerHead',
        in_channels=[64, 128, 320, 512],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=320,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py
ADDED
@@ -0,0 +1,9 @@
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=160000)
checkpoint_config = dict(by_epoch=False, interval=16000)
evaluation = dict(interval=16000, metric='mIoU')
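The 'poly' policy configured above decays the learning rate per iteration rather than per epoch (by_epoch=False). A minimal standalone sketch of the decay it implies, assuming mmcv's usual poly formula lr = (base_lr - min_lr) * (1 - iter / max_iters) ** power + min_lr:

# standalone sketch (not part of the commit): per-iteration LR implied by lr_config above
def poly_lr(t, base_lr=0.01, min_lr=1e-4, power=0.9, max_iters=160000):
    coeff = (1.0 - t / max_iters) ** power
    return (base_lr - min_lr) * coeff + min_lr

print(poly_lr(0))        # 0.01 at the first iteration
print(poly_lr(80000))    # decayed value halfway through training
print(poly_lr(160000))   # bottoms out at the min_lr floor of 1e-4
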
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py
ADDED
@@ -0,0 +1,9 @@
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=20000)
checkpoint_config = dict(by_epoch=False, interval=2000)
evaluation = dict(interval=2000, metric='mIoU')
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py
ADDED
@@ -0,0 +1,9 @@
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU')
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py
ADDED
@@ -0,0 +1,9 @@
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=80000)
checkpoint_config = dict(by_epoch=False, interval=8000)
evaluation = dict(interval=8000, metric='mIoU')
extensions/microsoftexcel-controlnet/annotator/uniformer/inference.py
ADDED
@@ -0,0 +1,144 @@

import torch

try:
    import mmcv as mmcv
    from mmcv.parallel import collate, scatter
    from mmcv.runner import load_checkpoint
    from mmseg.datasets.pipelines import Compose
    from mmseg.models import build_segmentor
except ImportError:
    import annotator.mmpkg.mmcv as mmcv
    from annotator.mmpkg.mmcv.parallel import collate, scatter
    from annotator.mmpkg.mmcv.runner import load_checkpoint
    from annotator.mmpkg.mmseg.datasets.pipelines import Compose
    from annotator.mmpkg.mmseg.models import build_segmentor

def init_segmentor(config, checkpoint=None, device='cuda:0'):
    """Initialize a segmentor from config file.

    Args:
        config (str or :obj:`mmcv.Config`): Config file path or the config
            object.
        checkpoint (str, optional): Checkpoint path. If left as None, the model
            will not load any weights.
        device (str, optional) CPU/CUDA device option. Default 'cuda:0'.
            Use 'cpu' for loading model on CPU.
    Returns:
        nn.Module: The constructed segmentor.
    """
    if isinstance(config, str):
        config = mmcv.Config.fromfile(config)
    elif not isinstance(config, mmcv.Config):
        raise TypeError('config must be a filename or Config object, '
                        'but got {}'.format(type(config)))
    config.model.pretrained = None
    config.model.train_cfg = None
    model = build_segmentor(config.model, test_cfg=config.get('test_cfg'))
    if checkpoint is not None:
        checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
        model.CLASSES = checkpoint['meta']['CLASSES']
        model.PALETTE = checkpoint['meta']['PALETTE']
    model.cfg = config  # save the config in the model for convenience
    model.to(device)
    model.eval()
    return model


class LoadImage:
    """A simple pipeline to load image."""

    def __call__(self, results):
        """Call function to load images into results.

        Args:
            results (dict): A result dict contains the file name
                of the image to be read.

        Returns:
            dict: ``results`` will be returned containing loaded image.
        """

        if isinstance(results['img'], str):
            results['filename'] = results['img']
            results['ori_filename'] = results['img']
        else:
            results['filename'] = None
            results['ori_filename'] = None
        img = mmcv.imread(results['img'])
        results['img'] = img
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        return results


def inference_segmentor(model, img):
    """Inference image(s) with the segmentor.

    Args:
        model (nn.Module): The loaded segmentor.
        imgs (str/ndarray or list[str/ndarray]): Either image files or loaded
            images.

    Returns:
        (list[Tensor]): The segmentation result.
    """
    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:]
    test_pipeline = Compose(test_pipeline)
    # prepare data
    data = dict(img=img)
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        data['img_metas'] = [i.data[0] for i in data['img_metas']]

    data['img'] = [x.to(device) for x in data['img']]

    # forward the model
    with torch.no_grad():
        result = model(return_loss=False, rescale=True, **data)
    return result


def show_result_pyplot(model,
                       img,
                       result,
                       palette=None,
                       fig_size=(15, 10),
                       opacity=0.5,
                       title='',
                       block=True):
    """Visualize the segmentation results on the image.

    Args:
        model (nn.Module): The loaded segmentor.
        img (str or np.ndarray): Image filename or loaded image.
        result (list): The segmentation result.
        palette (list[list[int]]] | None): The palette of segmentation
            map. If None is given, random palette will be generated.
            Default: None
        fig_size (tuple): Figure size of the pyplot figure.
        opacity(float): Opacity of painted segmentation map.
            Default 0.5.
            Must be in (0, 1] range.
        title (str): The title of pyplot figure.
            Default is ''.
        block (bool): Whether to block the pyplot figure.
            Default is True.
    """
    if hasattr(model, 'module'):
        model = model.module
    img = model.show_result(
        img, result, palette=palette, show=False, opacity=opacity)
    # plt.figure(figsize=fig_size)
    # plt.imshow(mmcv.bgr2rgb(img))
    # plt.title(title)
    # plt.tight_layout()
    # plt.show(block=block)
    return mmcv.bgr2rgb(img)
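The three helpers above are meant to be used together: build the segmentor once with init_segmentor, run inference_segmentor per image, and draw the prediction with show_result_pyplot. A minimal usage sketch, assuming the extension root is on sys.path; the checkpoint file name and image path are hypothetical placeholders:

# usage sketch (not part of the commit); checkpoint and image paths are hypothetical
from annotator.uniformer.inference import (init_segmentor, inference_segmentor,
                                            show_result_pyplot)

config_file = 'annotator/uniformer/upernet_global_small.py'
checkpoint_file = 'upernet_global_small.pth'  # hypothetical checkpoint path

model = init_segmentor(config_file, checkpoint_file, device='cuda:0')
result = inference_segmentor(model, 'demo.png')      # list with one H x W label map
vis = show_result_pyplot(model, 'demo.png', result)  # RGB ndarray with the overlay drawn
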
extensions/microsoftexcel-controlnet/annotator/uniformer/mmcv_custom/__init__.py
ADDED
@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-

from .checkpoint import load_checkpoint

__all__ = ['load_checkpoint']
extensions/microsoftexcel-controlnet/annotator/uniformer/mmcv_custom/checkpoint.py
ADDED
@@ -0,0 +1,508 @@
# Copyright (c) Open-MMLab. All rights reserved.
import io
import os
import os.path as osp
import pkgutil
import time
import warnings
from collections import OrderedDict
from importlib import import_module
from tempfile import TemporaryDirectory

import torch
import torchvision
from torch.optim import Optimizer
from torch.utils import model_zoo
from torch.nn import functional as F

try:
    import mmcv as mmcv
    from mmcv.fileio import FileClient
    from mmcv.fileio import load as load_file
    from mmcv.parallel import is_module_wrapper
    from mmcv.utils import mkdir_or_exist
    from mmcv.runner import get_dist_info
except ImportError:
    import annotator.mmpkg.mmcv as mmcv
    from annotator.mmpkg.mmcv.fileio import FileClient
    from annotator.mmpkg.mmcv.fileio import load as load_file
    from annotator.mmpkg.mmcv.parallel import is_module_wrapper
    from annotator.mmpkg.mmcv.utils import mkdir_or_exist
    from annotator.mmpkg.mmcv.runner import get_dist_info

ENV_MMCV_HOME = 'MMCV_HOME'
ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
DEFAULT_CACHE_DIR = '~/.cache'


def _get_mmcv_home():
    mmcv_home = os.path.expanduser(
        os.getenv(
            ENV_MMCV_HOME,
            os.path.join(
                os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv')))

    mkdir_or_exist(mmcv_home)
    return mmcv_home


def load_state_dict(module, state_dict, strict=False, logger=None):
    """Load state_dict to a module.

    This method is modified from :meth:`torch.nn.Module.load_state_dict`.
    Default value for ``strict`` is set to ``False`` and the message for
    param mismatch will be shown even if strict is False.

    Args:
        module (Module): Module that receives the state_dict.
        state_dict (OrderedDict): Weights.
        strict (bool): whether to strictly enforce that the keys
            in :attr:`state_dict` match the keys returned by this module's
            :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
        logger (:obj:`logging.Logger`, optional): Logger to log the error
            message. If not specified, print function will be used.
    """
    unexpected_keys = []
    all_missing_keys = []
    err_msg = []

    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    # use _load_from_state_dict to enable checkpoint version control
    def load(module, prefix=''):
        # recursively check parallel module in case that the model has a
        # complicated structure, e.g., nn.Module(nn.Module(DDP))
        if is_module_wrapper(module):
            module = module.module
        local_metadata = {} if metadata is None else metadata.get(
            prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     all_missing_keys, unexpected_keys,
                                     err_msg)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(module)
    load = None  # break load->load reference cycle

    # ignore "num_batches_tracked" of BN layers
    missing_keys = [
        key for key in all_missing_keys if 'num_batches_tracked' not in key
    ]

    if unexpected_keys:
        err_msg.append('unexpected key in source '
                       f'state_dict: {", ".join(unexpected_keys)}\n')
    if missing_keys:
        err_msg.append(
            f'missing keys in source state_dict: {", ".join(missing_keys)}\n')
|
103 |
+
|
104 |
+
rank, _ = get_dist_info()
|
105 |
+
if len(err_msg) > 0 and rank == 0:
|
106 |
+
err_msg.insert(
|
107 |
+
0, 'The model and loaded state dict do not match exactly\n')
|
108 |
+
err_msg = '\n'.join(err_msg)
|
109 |
+
if strict:
|
110 |
+
raise RuntimeError(err_msg)
|
111 |
+
elif logger is not None:
|
112 |
+
logger.warning(err_msg)
|
113 |
+
else:
|
114 |
+
print(err_msg)
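To illustrate the non-strict behaviour documented above: with `strict=False` a key mismatch is only reported (to the logger if one is passed, otherwise via `print`), while `strict=True` raises. A toy sketch with deliberately mismatched modules (illustrative only, not part of this file):

import torch.nn as nn
src = nn.Linear(4, 4)                   # provides 'weight' / 'bias'
dst = nn.Sequential(nn.Linear(4, 4))    # expects '0.weight' / '0.bias'
load_state_dict(dst, src.state_dict())  # prints the mismatch summary
# load_state_dict(dst, src.state_dict(), strict=True)  # would raise RuntimeError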
|
115 |
+
|
116 |
+
|
117 |
+
def load_url_dist(url, model_dir=None):
|
118 |
+
"""In distributed setting, this function only download checkpoint at local
|
119 |
+
rank 0."""
|
120 |
+
rank, world_size = get_dist_info()
|
121 |
+
rank = int(os.environ.get('LOCAL_RANK', rank))
|
122 |
+
if rank == 0:
|
123 |
+
checkpoint = model_zoo.load_url(url, model_dir=model_dir)
|
124 |
+
if world_size > 1:
|
125 |
+
torch.distributed.barrier()
|
126 |
+
if rank > 0:
|
127 |
+
checkpoint = model_zoo.load_url(url, model_dir=model_dir)
|
128 |
+
return checkpoint
|
129 |
+
|
130 |
+
|
131 |
+
def load_pavimodel_dist(model_path, map_location=None):
|
132 |
+
"""In distributed setting, this function only download checkpoint at local
|
133 |
+
rank 0."""
|
134 |
+
try:
|
135 |
+
from pavi import modelcloud
|
136 |
+
except ImportError:
|
137 |
+
raise ImportError(
|
138 |
+
'Please install pavi to load checkpoint from modelcloud.')
|
139 |
+
rank, world_size = get_dist_info()
|
140 |
+
rank = int(os.environ.get('LOCAL_RANK', rank))
|
141 |
+
if rank == 0:
|
142 |
+
model = modelcloud.get(model_path)
|
143 |
+
with TemporaryDirectory() as tmp_dir:
|
144 |
+
downloaded_file = osp.join(tmp_dir, model.name)
|
145 |
+
model.download(downloaded_file)
|
146 |
+
checkpoint = torch.load(downloaded_file, map_location=map_location)
|
147 |
+
if world_size > 1:
|
148 |
+
torch.distributed.barrier()
|
149 |
+
if rank > 0:
|
150 |
+
model = modelcloud.get(model_path)
|
151 |
+
with TemporaryDirectory() as tmp_dir:
|
152 |
+
downloaded_file = osp.join(tmp_dir, model.name)
|
153 |
+
model.download(downloaded_file)
|
154 |
+
checkpoint = torch.load(
|
155 |
+
downloaded_file, map_location=map_location)
|
156 |
+
return checkpoint
|
157 |
+
|
158 |
+
|
159 |
+
def load_fileclient_dist(filename, backend, map_location):
|
160 |
+
"""In distributed setting, this function only download checkpoint at local
|
161 |
+
rank 0."""
|
162 |
+
rank, world_size = get_dist_info()
|
163 |
+
rank = int(os.environ.get('LOCAL_RANK', rank))
|
164 |
+
allowed_backends = ['ceph']
|
165 |
+
if backend not in allowed_backends:
|
166 |
+
raise ValueError(f'Load from Backend {backend} is not supported.')
|
167 |
+
if rank == 0:
|
168 |
+
fileclient = FileClient(backend=backend)
|
169 |
+
buffer = io.BytesIO(fileclient.get(filename))
|
170 |
+
checkpoint = torch.load(buffer, map_location=map_location)
|
171 |
+
if world_size > 1:
|
172 |
+
torch.distributed.barrier()
|
173 |
+
if rank > 0:
|
174 |
+
fileclient = FileClient(backend=backend)
|
175 |
+
buffer = io.BytesIO(fileclient.get(filename))
|
176 |
+
checkpoint = torch.load(buffer, map_location=map_location)
|
177 |
+
return checkpoint
|
178 |
+
|
179 |
+
|
180 |
+
def get_torchvision_models():
|
181 |
+
model_urls = dict()
|
182 |
+
for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
|
183 |
+
if ispkg:
|
184 |
+
continue
|
185 |
+
_zoo = import_module(f'torchvision.models.{name}')
|
186 |
+
if hasattr(_zoo, 'model_urls'):
|
187 |
+
_urls = getattr(_zoo, 'model_urls')
|
188 |
+
model_urls.update(_urls)
|
189 |
+
return model_urls
|
190 |
+
|
191 |
+
|
192 |
+
def get_external_models():
|
193 |
+
mmcv_home = _get_mmcv_home()
|
194 |
+
default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
|
195 |
+
default_urls = load_file(default_json_path)
|
196 |
+
assert isinstance(default_urls, dict)
|
197 |
+
external_json_path = osp.join(mmcv_home, 'open_mmlab.json')
|
198 |
+
if osp.exists(external_json_path):
|
199 |
+
external_urls = load_file(external_json_path)
|
200 |
+
assert isinstance(external_urls, dict)
|
201 |
+
default_urls.update(external_urls)
|
202 |
+
|
203 |
+
return default_urls
|
204 |
+
|
205 |
+
|
206 |
+
def get_mmcls_models():
|
207 |
+
mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
|
208 |
+
mmcls_urls = load_file(mmcls_json_path)
|
209 |
+
|
210 |
+
return mmcls_urls
|
211 |
+
|
212 |
+
|
213 |
+
def get_deprecated_model_names():
|
214 |
+
deprecate_json_path = osp.join(mmcv.__path__[0],
|
215 |
+
'model_zoo/deprecated.json')
|
216 |
+
deprecate_urls = load_file(deprecate_json_path)
|
217 |
+
assert isinstance(deprecate_urls, dict)
|
218 |
+
|
219 |
+
return deprecate_urls
|
220 |
+
|
221 |
+
|
222 |
+
def _process_mmcls_checkpoint(checkpoint):
|
223 |
+
state_dict = checkpoint['state_dict']
|
224 |
+
new_state_dict = OrderedDict()
|
225 |
+
for k, v in state_dict.items():
|
226 |
+
if k.startswith('backbone.'):
|
227 |
+
new_state_dict[k[9:]] = v
|
228 |
+
new_checkpoint = dict(state_dict=new_state_dict)
|
229 |
+
|
230 |
+
return new_checkpoint
|
231 |
+
|
232 |
+
|
233 |
+
def _load_checkpoint(filename, map_location=None):
|
234 |
+
"""Load checkpoint from somewhere (modelzoo, file, url).
|
235 |
+
|
236 |
+
Args:
|
237 |
+
filename (str): Accept local filepath, URL, ``torchvision://xxx``,
|
238 |
+
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
|
239 |
+
details.
|
240 |
+
map_location (str | None): Same as :func:`torch.load`. Default: None.
|
241 |
+
|
242 |
+
Returns:
|
243 |
+
dict | OrderedDict: The loaded checkpoint. It can be either an
|
244 |
+
OrderedDict storing model weights or a dict containing other
|
245 |
+
information, which depends on the checkpoint.
|
246 |
+
"""
|
247 |
+
if filename.startswith('modelzoo://'):
|
248 |
+
warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
|
249 |
+
'use "torchvision://" instead')
|
250 |
+
model_urls = get_torchvision_models()
|
251 |
+
model_name = filename[11:]
|
252 |
+
checkpoint = load_url_dist(model_urls[model_name])
|
253 |
+
elif filename.startswith('torchvision://'):
|
254 |
+
model_urls = get_torchvision_models()
|
255 |
+
model_name = filename[14:]
|
256 |
+
checkpoint = load_url_dist(model_urls[model_name])
|
257 |
+
elif filename.startswith('open-mmlab://'):
|
258 |
+
model_urls = get_external_models()
|
259 |
+
model_name = filename[13:]
|
260 |
+
deprecated_urls = get_deprecated_model_names()
|
261 |
+
if model_name in deprecated_urls:
|
262 |
+
warnings.warn(f'open-mmlab://{model_name} is deprecated in favor '
|
263 |
+
f'of open-mmlab://{deprecated_urls[model_name]}')
|
264 |
+
model_name = deprecated_urls[model_name]
|
265 |
+
model_url = model_urls[model_name]
|
266 |
+
# check if is url
|
267 |
+
if model_url.startswith(('http://', 'https://')):
|
268 |
+
checkpoint = load_url_dist(model_url)
|
269 |
+
else:
|
270 |
+
filename = osp.join(_get_mmcv_home(), model_url)
|
271 |
+
if not osp.isfile(filename):
|
272 |
+
raise IOError(f'{filename} is not a checkpoint file')
|
273 |
+
checkpoint = torch.load(filename, map_location=map_location)
|
274 |
+
elif filename.startswith('mmcls://'):
|
275 |
+
model_urls = get_mmcls_models()
|
276 |
+
model_name = filename[8:]
|
277 |
+
checkpoint = load_url_dist(model_urls[model_name])
|
278 |
+
checkpoint = _process_mmcls_checkpoint(checkpoint)
|
279 |
+
elif filename.startswith(('http://', 'https://')):
|
280 |
+
checkpoint = load_url_dist(filename)
|
281 |
+
elif filename.startswith('pavi://'):
|
282 |
+
model_path = filename[7:]
|
283 |
+
checkpoint = load_pavimodel_dist(model_path, map_location=map_location)
|
284 |
+
elif filename.startswith('s3://'):
|
285 |
+
checkpoint = load_fileclient_dist(
|
286 |
+
filename, backend='ceph', map_location=map_location)
|
287 |
+
else:
|
288 |
+
if not osp.isfile(filename):
|
289 |
+
raise IOError(f'{filename} is not a checkpoint file')
|
290 |
+
checkpoint = torch.load(filename, map_location=map_location)
|
291 |
+
return checkpoint
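The branches above mean `filename` can be a local path, a plain URL, or one of several prefixed schemes; a short illustrative sketch (the concrete entries are examples and depend on the torchvision/mmcv model-zoo JSON files available locally):

ckpt = _load_checkpoint('torchvision://resnet50')                  # torchvision model zoo
ckpt = _load_checkpoint('open-mmlab://resnet50_v1c')               # open_mmlab.json entry
ckpt = _load_checkpoint('https://example.com/model.pth')           # downloaded on rank 0 only
ckpt = _load_checkpoint('/path/to/local.pth', map_location='cpu')  # plain torch.load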
|
292 |
+
|
293 |
+
|
294 |
+
def load_checkpoint(model,
|
295 |
+
filename,
|
296 |
+
map_location='cpu',
|
297 |
+
strict=False,
|
298 |
+
logger=None):
|
299 |
+
"""Load checkpoint from a file or URI.
|
300 |
+
|
301 |
+
Args:
|
302 |
+
model (Module): Module to load checkpoint.
|
303 |
+
filename (str): Accept local filepath, URL, ``torchvision://xxx``,
|
304 |
+
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
|
305 |
+
details.
|
306 |
+
map_location (str): Same as :func:`torch.load`.
|
307 |
+
strict (bool): Whether to allow different params for the model and
|
308 |
+
checkpoint.
|
309 |
+
logger (:mod:`logging.Logger` or None): The logger for error message.
|
310 |
+
|
311 |
+
Returns:
|
312 |
+
dict or OrderedDict: The loaded checkpoint.
|
313 |
+
"""
|
314 |
+
checkpoint = _load_checkpoint(filename, map_location)
|
315 |
+
# OrderedDict is a subclass of dict
|
316 |
+
if not isinstance(checkpoint, dict):
|
317 |
+
raise RuntimeError(
|
318 |
+
f'No state_dict found in checkpoint file {filename}')
|
319 |
+
# get state_dict from checkpoint
|
320 |
+
if 'state_dict' in checkpoint:
|
321 |
+
state_dict = checkpoint['state_dict']
|
322 |
+
elif 'model' in checkpoint:
|
323 |
+
state_dict = checkpoint['model']
|
324 |
+
else:
|
325 |
+
state_dict = checkpoint
|
326 |
+
# strip prefix of state_dict
|
327 |
+
if list(state_dict.keys())[0].startswith('module.'):
|
328 |
+
state_dict = {k[7:]: v for k, v in state_dict.items()}
|
329 |
+
|
330 |
+
# for MoBY, load model of online branch
|
331 |
+
if sorted(list(state_dict.keys()))[0].startswith('encoder'):
|
332 |
+
state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')}
|
333 |
+
|
334 |
+
# reshape absolute position embedding
|
335 |
+
if state_dict.get('absolute_pos_embed') is not None:
|
336 |
+
absolute_pos_embed = state_dict['absolute_pos_embed']
|
337 |
+
N1, L, C1 = absolute_pos_embed.size()
|
338 |
+
N2, C2, H, W = model.absolute_pos_embed.size()
|
339 |
+
if N1 != N2 or C1 != C2 or L != H*W:
|
340 |
+
logger.warning("Error in loading absolute_pos_embed, pass")
|
341 |
+
else:
|
342 |
+
state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2)
|
343 |
+
|
344 |
+
# interpolate position bias table if needed
|
345 |
+
relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k]
|
346 |
+
for table_key in relative_position_bias_table_keys:
|
347 |
+
table_pretrained = state_dict[table_key]
|
348 |
+
table_current = model.state_dict()[table_key]
|
349 |
+
L1, nH1 = table_pretrained.size()
|
350 |
+
L2, nH2 = table_current.size()
|
351 |
+
if nH1 != nH2:
|
352 |
+
logger.warning(f"Error in loading {table_key}, pass")
|
353 |
+
else:
|
354 |
+
if L1 != L2:
|
355 |
+
S1 = int(L1 ** 0.5)
|
356 |
+
S2 = int(L2 ** 0.5)
|
357 |
+
table_pretrained_resized = F.interpolate(
|
358 |
+
table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
|
359 |
+
size=(S2, S2), mode='bicubic')
|
360 |
+
state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0)
|
361 |
+
|
362 |
+
# load state_dict
|
363 |
+
load_state_dict(model, state_dict, strict, logger)
|
364 |
+
return checkpoint
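A hedged usage sketch for the wrapper above (the model class and path are placeholders): it strips a leading 'module.' prefix, keeps only the online 'encoder.' branch of MoBY-style checkpoints, resizes position-embedding tables when the shapes differ, and finally calls `load_state_dict` non-strictly, returning the raw checkpoint dict.

backbone = SomeBackbone()   # any nn.Module; 'SomeBackbone' is a stand-in
ckpt = load_checkpoint(backbone, '/path/to/pretrained.pth',
                       map_location='cpu', strict=False, logger=None)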
|
365 |
+
|
366 |
+
|
367 |
+
def weights_to_cpu(state_dict):
|
368 |
+
"""Copy a model state_dict to cpu.
|
369 |
+
|
370 |
+
Args:
|
371 |
+
state_dict (OrderedDict): Model weights on GPU.
|
372 |
+
|
373 |
+
Returns:
|
374 |
+
OrderedDict: Model weights on CPU.
|
375 |
+
"""
|
376 |
+
state_dict_cpu = OrderedDict()
|
377 |
+
for key, val in state_dict.items():
|
378 |
+
state_dict_cpu[key] = val.cpu()
|
379 |
+
return state_dict_cpu
|
380 |
+
|
381 |
+
|
382 |
+
def _save_to_state_dict(module, destination, prefix, keep_vars):
|
383 |
+
"""Saves module state to `destination` dictionary.
|
384 |
+
|
385 |
+
This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.
|
386 |
+
|
387 |
+
Args:
|
388 |
+
module (nn.Module): The module to generate state_dict.
|
389 |
+
destination (dict): A dict where state will be stored.
|
390 |
+
prefix (str): The prefix for parameters and buffers used in this
|
391 |
+
module.
|
392 |
+
"""
|
393 |
+
for name, param in module._parameters.items():
|
394 |
+
if param is not None:
|
395 |
+
destination[prefix + name] = param if keep_vars else param.detach()
|
396 |
+
for name, buf in module._buffers.items():
|
397 |
+
# remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
|
398 |
+
if buf is not None:
|
399 |
+
destination[prefix + name] = buf if keep_vars else buf.detach()
|
400 |
+
|
401 |
+
|
402 |
+
def get_state_dict(module, destination=None, prefix='', keep_vars=False):
|
403 |
+
"""Returns a dictionary containing a whole state of the module.
|
404 |
+
|
405 |
+
Both parameters and persistent buffers (e.g. running averages) are
|
406 |
+
included. Keys are corresponding parameter and buffer names.
|
407 |
+
|
408 |
+
This method is modified from :meth:`torch.nn.Module.state_dict` to
|
409 |
+
recursively check parallel module in case that the model has a complicated
|
410 |
+
structure, e.g., nn.Module(nn.Module(DDP)).
|
411 |
+
|
412 |
+
Args:
|
413 |
+
module (nn.Module): The module to generate state_dict.
|
414 |
+
destination (OrderedDict): Returned dict for the state of the
|
415 |
+
module.
|
416 |
+
prefix (str): Prefix of the key.
|
417 |
+
keep_vars (bool): Whether to keep the variable property of the
|
418 |
+
parameters. Default: False.
|
419 |
+
|
420 |
+
Returns:
|
421 |
+
dict: A dictionary containing a whole state of the module.
|
422 |
+
"""
|
423 |
+
# recursively check parallel module in case that the model has a
|
424 |
+
# complicated structure, e.g., nn.Module(nn.Module(DDP))
|
425 |
+
if is_module_wrapper(module):
|
426 |
+
module = module.module
|
427 |
+
|
428 |
+
# below is the same as torch.nn.Module.state_dict()
|
429 |
+
if destination is None:
|
430 |
+
destination = OrderedDict()
|
431 |
+
destination._metadata = OrderedDict()
|
432 |
+
destination._metadata[prefix[:-1]] = local_metadata = dict(
|
433 |
+
version=module._version)
|
434 |
+
_save_to_state_dict(module, destination, prefix, keep_vars)
|
435 |
+
for name, child in module._modules.items():
|
436 |
+
if child is not None:
|
437 |
+
get_state_dict(
|
438 |
+
child, destination, prefix + name + '.', keep_vars=keep_vars)
|
439 |
+
for hook in module._state_dict_hooks.values():
|
440 |
+
hook_result = hook(module, destination, prefix, local_metadata)
|
441 |
+
if hook_result is not None:
|
442 |
+
destination = hook_result
|
443 |
+
return destination
|
444 |
+
|
445 |
+
|
446 |
+
def save_checkpoint(model, filename, optimizer=None, meta=None):
|
447 |
+
"""Save checkpoint to file.
|
448 |
+
|
449 |
+
The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
|
450 |
+
``optimizer``. By default ``meta`` will contain version and time info.
|
451 |
+
|
452 |
+
Args:
|
453 |
+
model (Module): Module whose params are to be saved.
|
454 |
+
filename (str): Checkpoint filename.
|
455 |
+
optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
|
456 |
+
meta (dict, optional): Metadata to be saved in checkpoint.
|
457 |
+
"""
|
458 |
+
if meta is None:
|
459 |
+
meta = {}
|
460 |
+
elif not isinstance(meta, dict):
|
461 |
+
raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
|
462 |
+
meta.update(mmcv_version=mmcv.__version__, time=time.asctime())
|
463 |
+
|
464 |
+
if is_module_wrapper(model):
|
465 |
+
model = model.module
|
466 |
+
|
467 |
+
if hasattr(model, 'CLASSES') and model.CLASSES is not None:
|
468 |
+
# save class name to the meta
|
469 |
+
meta.update(CLASSES=model.CLASSES)
|
470 |
+
|
471 |
+
checkpoint = {
|
472 |
+
'meta': meta,
|
473 |
+
'state_dict': weights_to_cpu(get_state_dict(model))
|
474 |
+
}
|
475 |
+
# save optimizer state dict in the checkpoint
|
476 |
+
if isinstance(optimizer, Optimizer):
|
477 |
+
checkpoint['optimizer'] = optimizer.state_dict()
|
478 |
+
elif isinstance(optimizer, dict):
|
479 |
+
checkpoint['optimizer'] = {}
|
480 |
+
for name, optim in optimizer.items():
|
481 |
+
checkpoint['optimizer'][name] = optim.state_dict()
|
482 |
+
|
483 |
+
if filename.startswith('pavi://'):
|
484 |
+
try:
|
485 |
+
from pavi import modelcloud
|
486 |
+
from pavi.exception import NodeNotFoundError
|
487 |
+
except ImportError:
|
488 |
+
raise ImportError(
|
489 |
+
'Please install pavi to load checkpoint from modelcloud.')
|
490 |
+
model_path = filename[7:]
|
491 |
+
root = modelcloud.Folder()
|
492 |
+
model_dir, model_name = osp.split(model_path)
|
493 |
+
try:
|
494 |
+
model = modelcloud.get(model_dir)
|
495 |
+
except NodeNotFoundError:
|
496 |
+
model = root.create_training_model(model_dir)
|
497 |
+
with TemporaryDirectory() as tmp_dir:
|
498 |
+
checkpoint_file = osp.join(tmp_dir, model_name)
|
499 |
+
with open(checkpoint_file, 'wb') as f:
|
500 |
+
torch.save(checkpoint, f)
|
501 |
+
f.flush()
|
502 |
+
model.create_file(checkpoint_file, name=model_name)
|
503 |
+
else:
|
504 |
+
mmcv.mkdir_or_exist(osp.dirname(filename))
|
505 |
+
# immediately flush buffer
|
506 |
+
with open(filename, 'wb') as f:
|
507 |
+
torch.save(checkpoint, f)
|
508 |
+
f.flush()
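A minimal save/load round trip with the two public helpers in this file (the target path is illustrative):

import torch.nn as nn
net = nn.Linear(8, 2)
save_checkpoint(net, '/tmp/linear_demo.pth', meta=dict(note='demo'))
restored = nn.Linear(8, 2)
load_checkpoint(restored, '/tmp/linear_demo.pth', map_location='cpu')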
|
extensions/microsoftexcel-controlnet/annotator/uniformer/uniformer.py
ADDED
@@ -0,0 +1,426 @@
1 |
+
# --------------------------------------------------------
|
2 |
+
# UniFormer
|
3 |
+
# Copyright (c) 2022 SenseTime X-Lab
|
4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
5 |
+
# Written by Kunchang Li
|
6 |
+
# --------------------------------------------------------
|
7 |
+
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
import torch.utils.checkpoint as checkpoint
|
13 |
+
|
14 |
+
from functools import partial
|
15 |
+
from collections import OrderedDict
|
16 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
17 |
+
|
18 |
+
try:
|
19 |
+
from mmseg.utils import get_root_logger
|
20 |
+
from mmseg.models.builder import BACKBONES
|
21 |
+
except ImportError:
|
22 |
+
from annotator.mmpkg.mmseg.utils import get_root_logger
|
23 |
+
from annotator.mmpkg.mmseg.models.builder import BACKBONES
|
24 |
+
|
25 |
+
from annotator.uniformer.mmcv_custom import load_checkpoint
|
26 |
+
|
27 |
+
|
28 |
+
class Mlp(nn.Module):
|
29 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
30 |
+
super().__init__()
|
31 |
+
out_features = out_features or in_features
|
32 |
+
hidden_features = hidden_features or in_features
|
33 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
34 |
+
self.act = act_layer()
|
35 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
36 |
+
self.drop = nn.Dropout(drop)
|
37 |
+
|
38 |
+
def forward(self, x):
|
39 |
+
x = self.fc1(x)
|
40 |
+
x = self.act(x)
|
41 |
+
x = self.drop(x)
|
42 |
+
x = self.fc2(x)
|
43 |
+
x = self.drop(x)
|
44 |
+
return x
|
45 |
+
|
46 |
+
|
47 |
+
class CMlp(nn.Module):
|
48 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
49 |
+
super().__init__()
|
50 |
+
out_features = out_features or in_features
|
51 |
+
hidden_features = hidden_features or in_features
|
52 |
+
self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
|
53 |
+
self.act = act_layer()
|
54 |
+
self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
|
55 |
+
self.drop = nn.Dropout(drop)
|
56 |
+
|
57 |
+
def forward(self, x):
|
58 |
+
x = self.fc1(x)
|
59 |
+
x = self.act(x)
|
60 |
+
x = self.drop(x)
|
61 |
+
x = self.fc2(x)
|
62 |
+
x = self.drop(x)
|
63 |
+
return x
|
64 |
+
|
65 |
+
|
66 |
+
class CBlock(nn.Module):
|
67 |
+
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
|
68 |
+
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
|
69 |
+
super().__init__()
|
70 |
+
self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
|
71 |
+
self.norm1 = nn.BatchNorm2d(dim)
|
72 |
+
self.conv1 = nn.Conv2d(dim, dim, 1)
|
73 |
+
self.conv2 = nn.Conv2d(dim, dim, 1)
|
74 |
+
self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
|
75 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
76 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
77 |
+
self.norm2 = nn.BatchNorm2d(dim)
|
78 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
79 |
+
self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
80 |
+
|
81 |
+
def forward(self, x):
|
82 |
+
x = x + self.pos_embed(x)
|
83 |
+
x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x)))))
|
84 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
85 |
+
return x
|
86 |
+
|
87 |
+
|
88 |
+
class Attention(nn.Module):
|
89 |
+
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
|
90 |
+
super().__init__()
|
91 |
+
self.num_heads = num_heads
|
92 |
+
head_dim = dim // num_heads
|
93 |
+
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
|
94 |
+
self.scale = qk_scale or head_dim ** -0.5
|
95 |
+
|
96 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
97 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
98 |
+
self.proj = nn.Linear(dim, dim)
|
99 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
100 |
+
|
101 |
+
def forward(self, x):
|
102 |
+
B, N, C = x.shape
|
103 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
104 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
105 |
+
|
106 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
107 |
+
attn = attn.softmax(dim=-1)
|
108 |
+
attn = self.attn_drop(attn)
|
109 |
+
|
110 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
111 |
+
x = self.proj(x)
|
112 |
+
x = self.proj_drop(x)
|
113 |
+
return x
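A quick shape check for the block above (sizes chosen for illustration): with `dim=64` and `num_heads=8`, `qkv` maps (B, N, 64) to (B, N, 192), reshaped to (3, B, 8, N, 8), so the attention matrix is (B, 8, N, N) and the output keeps the input shape.

attn = Attention(dim=64, num_heads=8)
tokens = torch.randn(2, 196, 64)        # B=2, N=196 tokens, C=64
assert attn(tokens).shape == (2, 196, 64)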
|
114 |
+
|
115 |
+
|
116 |
+
class SABlock(nn.Module):
|
117 |
+
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
|
118 |
+
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
|
119 |
+
super().__init__()
|
120 |
+
self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
|
121 |
+
self.norm1 = norm_layer(dim)
|
122 |
+
self.attn = Attention(
|
123 |
+
dim,
|
124 |
+
num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
125 |
+
attn_drop=attn_drop, proj_drop=drop)
|
126 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
127 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
128 |
+
self.norm2 = norm_layer(dim)
|
129 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
130 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
131 |
+
|
132 |
+
def forward(self, x):
|
133 |
+
x = x + self.pos_embed(x)
|
134 |
+
B, N, H, W = x.shape
|
135 |
+
x = x.flatten(2).transpose(1, 2)
|
136 |
+
x = x + self.drop_path(self.attn(self.norm1(x)))
|
137 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
138 |
+
x = x.transpose(1, 2).reshape(B, N, H, W)
|
139 |
+
return x
|
140 |
+
|
141 |
+
|
142 |
+
def window_partition(x, window_size):
|
143 |
+
"""
|
144 |
+
Args:
|
145 |
+
x: (B, H, W, C)
|
146 |
+
window_size (int): window size
|
147 |
+
Returns:
|
148 |
+
windows: (num_windows*B, window_size, window_size, C)
|
149 |
+
"""
|
150 |
+
B, H, W, C = x.shape
|
151 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
152 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
153 |
+
return windows
|
154 |
+
|
155 |
+
|
156 |
+
def window_reverse(windows, window_size, H, W):
|
157 |
+
"""
|
158 |
+
Args:
|
159 |
+
windows: (num_windows*B, window_size, window_size, C)
|
160 |
+
window_size (int): Window size
|
161 |
+
H (int): Height of image
|
162 |
+
W (int): Width of image
|
163 |
+
Returns:
|
164 |
+
x: (B, H, W, C)
|
165 |
+
"""
|
166 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
167 |
+
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
|
168 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
169 |
+
return x
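The two helpers above are exact inverses when H and W are multiples of the window size; a small round-trip check (sizes are illustrative):

x = torch.randn(2, 28, 28, 64)                 # (B, H, W, C)
windows = window_partition(x, window_size=7)   # (2 * 4 * 4, 7, 7, 64)
assert windows.shape == (32, 7, 7, 64)
assert torch.equal(window_reverse(windows, 7, 28, 28), x)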
|
170 |
+
|
171 |
+
|
172 |
+
class SABlock_Windows(nn.Module):
|
173 |
+
def __init__(self, dim, num_heads, window_size=14, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
|
174 |
+
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
|
175 |
+
super().__init__()
|
176 |
+
self.window_size=window_size
|
177 |
+
self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
|
178 |
+
self.norm1 = norm_layer(dim)
|
179 |
+
self.attn = Attention(
|
180 |
+
dim,
|
181 |
+
num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
182 |
+
attn_drop=attn_drop, proj_drop=drop)
|
183 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
184 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
185 |
+
self.norm2 = norm_layer(dim)
|
186 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
187 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
188 |
+
|
189 |
+
def forward(self, x):
|
190 |
+
x = x + self.pos_embed(x)
|
191 |
+
x = x.permute(0, 2, 3, 1)
|
192 |
+
B, H, W, C = x.shape
|
193 |
+
shortcut = x
|
194 |
+
x = self.norm1(x)
|
195 |
+
|
196 |
+
pad_l = pad_t = 0
|
197 |
+
pad_r = (self.window_size - W % self.window_size) % self.window_size
|
198 |
+
pad_b = (self.window_size - H % self.window_size) % self.window_size
|
199 |
+
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
|
200 |
+
_, Hp, Wp, _ = x.shape
|
201 |
+
|
202 |
+
x_windows = window_partition(x, self.window_size) # nW*B, window_size, window_size, C
|
203 |
+
x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
|
204 |
+
|
205 |
+
# W-MSA/SW-MSA
|
206 |
+
attn_windows = self.attn(x_windows) # nW*B, window_size*window_size, C
|
207 |
+
|
208 |
+
# merge windows
|
209 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
210 |
+
x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
|
211 |
+
|
212 |
+
# reverse cyclic shift
|
213 |
+
if pad_r > 0 or pad_b > 0:
|
214 |
+
x = x[:, :H, :W, :].contiguous()
|
215 |
+
|
216 |
+
x = shortcut + self.drop_path(x)
|
217 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
218 |
+
x = x.permute(0, 3, 1, 2).reshape(B, C, H, W)
|
219 |
+
return x
|
220 |
+
|
221 |
+
|
222 |
+
class PatchEmbed(nn.Module):
|
223 |
+
""" Image to Patch Embedding
|
224 |
+
"""
|
225 |
+
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
|
226 |
+
super().__init__()
|
227 |
+
img_size = to_2tuple(img_size)
|
228 |
+
patch_size = to_2tuple(patch_size)
|
229 |
+
num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
|
230 |
+
self.img_size = img_size
|
231 |
+
self.patch_size = patch_size
|
232 |
+
self.num_patches = num_patches
|
233 |
+
self.norm = nn.LayerNorm(embed_dim)
|
234 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
|
235 |
+
|
236 |
+
def forward(self, x):
|
237 |
+
B, _, H, W = x.shape
|
238 |
+
x = self.proj(x)
|
239 |
+
B, _, H, W = x.shape
|
240 |
+
x = x.flatten(2).transpose(1, 2)
|
241 |
+
x = self.norm(x)
|
242 |
+
x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
|
243 |
+
return x
|
244 |
+
|
245 |
+
|
246 |
+
@BACKBONES.register_module()
|
247 |
+
class UniFormer(nn.Module):
|
248 |
+
""" Vision Transformer
|
249 |
+
A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
|
250 |
+
https://arxiv.org/abs/2010.11929
|
251 |
+
"""
|
252 |
+
def __init__(self, layers=[3, 4, 8, 3], img_size=224, in_chans=3, num_classes=80, embed_dim=[64, 128, 320, 512],
|
253 |
+
head_dim=64, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
|
254 |
+
drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
255 |
+
pretrained_path=None, use_checkpoint=False, checkpoint_num=[0, 0, 0, 0],
|
256 |
+
windows=False, hybrid=False, window_size=14):
|
257 |
+
"""
|
258 |
+
Args:
|
259 |
+
layers (list): number of blocks in each layer
|
260 |
+
img_size (int, tuple): input image size
|
261 |
+
in_chans (int): number of input channels
|
262 |
+
num_classes (int): number of classes for classification head
|
263 |
+
embed_dim (int): embedding dimension
|
264 |
+
head_dim (int): dimension of attention heads
|
265 |
+
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
|
266 |
+
qkv_bias (bool): enable bias for qkv if True
|
267 |
+
qk_scale (float): override default qk scale of head_dim ** -0.5 if set
|
268 |
+
representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
|
269 |
+
drop_rate (float): dropout rate
|
270 |
+
attn_drop_rate (float): attention dropout rate
|
271 |
+
drop_path_rate (float): stochastic depth rate
|
272 |
+
norm_layer (nn.Module): normalization layer
|
273 |
+
pretrained_path (str): path of pretrained model
|
274 |
+
use_checkpoint (bool): whether use checkpoint
|
275 |
+
checkpoint_num (list): index for using checkpoint in every stage
|
276 |
+
windows (bool): whether use window MHRA
|
277 |
+
hybrid (bool): whether use hybrid MHRA
|
278 |
+
window_size (int): size of window (>14)
|
279 |
+
"""
|
280 |
+
super().__init__()
|
281 |
+
self.num_classes = num_classes
|
282 |
+
self.use_checkpoint = use_checkpoint
|
283 |
+
self.checkpoint_num = checkpoint_num
|
284 |
+
self.windows = windows
|
285 |
+
print(f'Use Checkpoint: {self.use_checkpoint}')
|
286 |
+
print(f'Checkpoint Number: {self.checkpoint_num}')
|
287 |
+
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
|
288 |
+
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
|
289 |
+
|
290 |
+
self.patch_embed1 = PatchEmbed(
|
291 |
+
img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0])
|
292 |
+
self.patch_embed2 = PatchEmbed(
|
293 |
+
img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1])
|
294 |
+
self.patch_embed3 = PatchEmbed(
|
295 |
+
img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2])
|
296 |
+
self.patch_embed4 = PatchEmbed(
|
297 |
+
img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3])
|
298 |
+
|
299 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
300 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(layers))] # stochastic depth decay rule
|
301 |
+
num_heads = [dim // head_dim for dim in embed_dim]
|
302 |
+
self.blocks1 = nn.ModuleList([
|
303 |
+
CBlock(
|
304 |
+
dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
305 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
|
306 |
+
for i in range(layers[0])])
|
307 |
+
self.norm1=norm_layer(embed_dim[0])
|
308 |
+
self.blocks2 = nn.ModuleList([
|
309 |
+
CBlock(
|
310 |
+
dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
311 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]], norm_layer=norm_layer)
|
312 |
+
for i in range(layers[1])])
|
313 |
+
self.norm2 = norm_layer(embed_dim[1])
|
314 |
+
if self.windows:
|
315 |
+
print('Use local window for all blocks in stage3')
|
316 |
+
self.blocks3 = nn.ModuleList([
|
317 |
+
SABlock_Windows(
|
318 |
+
dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
319 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)
|
320 |
+
for i in range(layers[2])])
|
321 |
+
elif hybrid:
|
322 |
+
print('Use hybrid window for blocks in stage3')
|
323 |
+
block3 = []
|
324 |
+
for i in range(layers[2]):
|
325 |
+
if (i + 1) % 4 == 0:
|
326 |
+
block3.append(SABlock(
|
327 |
+
dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
328 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer))
|
329 |
+
else:
|
330 |
+
block3.append(SABlock_Windows(
|
331 |
+
dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
332 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer))
|
333 |
+
self.blocks3 = nn.ModuleList(block3)
|
334 |
+
else:
|
335 |
+
print('Use global window for all blocks in stage3')
|
336 |
+
self.blocks3 = nn.ModuleList([
|
337 |
+
SABlock(
|
338 |
+
dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
339 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)
|
340 |
+
for i in range(layers[2])])
|
341 |
+
self.norm3 = norm_layer(embed_dim[2])
|
342 |
+
self.blocks4 = nn.ModuleList([
|
343 |
+
SABlock(
|
344 |
+
dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
345 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]+layers[2]], norm_layer=norm_layer)
|
346 |
+
for i in range(layers[3])])
|
347 |
+
self.norm4 = norm_layer(embed_dim[3])
|
348 |
+
|
349 |
+
# Representation layer
|
350 |
+
if representation_size:
|
351 |
+
self.num_features = representation_size
|
352 |
+
self.pre_logits = nn.Sequential(OrderedDict([
|
353 |
+
('fc', nn.Linear(embed_dim, representation_size)),
|
354 |
+
('act', nn.Tanh())
|
355 |
+
]))
|
356 |
+
else:
|
357 |
+
self.pre_logits = nn.Identity()
|
358 |
+
|
359 |
+
self.apply(self._init_weights)
|
360 |
+
self.init_weights(pretrained=pretrained_path)
|
361 |
+
|
362 |
+
def init_weights(self, pretrained):
|
363 |
+
if isinstance(pretrained, str):
|
364 |
+
logger = get_root_logger()
|
365 |
+
load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
|
366 |
+
print(f'Load pretrained model from {pretrained}')
|
367 |
+
def _init_weights(self, m):
|
368 |
+
if isinstance(m, nn.Linear):
|
369 |
+
trunc_normal_(m.weight, std=.02)
|
370 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
371 |
+
nn.init.constant_(m.bias, 0)
|
372 |
+
elif isinstance(m, nn.LayerNorm):
|
373 |
+
nn.init.constant_(m.bias, 0)
|
374 |
+
nn.init.constant_(m.weight, 1.0)
|
375 |
+
|
376 |
+
@torch.jit.ignore
|
377 |
+
def no_weight_decay(self):
|
378 |
+
return {'pos_embed', 'cls_token'}
|
379 |
+
|
380 |
+
def get_classifier(self):
|
381 |
+
return self.head
|
382 |
+
|
383 |
+
def reset_classifier(self, num_classes, global_pool=''):
|
384 |
+
self.num_classes = num_classes
|
385 |
+
self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
|
386 |
+
|
387 |
+
def forward_features(self, x):
|
388 |
+
out = []
|
389 |
+
x = self.patch_embed1(x)
|
390 |
+
x = self.pos_drop(x)
|
391 |
+
for i, blk in enumerate(self.blocks1):
|
392 |
+
if self.use_checkpoint and i < self.checkpoint_num[0]:
|
393 |
+
x = checkpoint.checkpoint(blk, x)
|
394 |
+
else:
|
395 |
+
x = blk(x)
|
396 |
+
x_out = self.norm1(x.permute(0, 2, 3, 1))
|
397 |
+
out.append(x_out.permute(0, 3, 1, 2).contiguous())
|
398 |
+
x = self.patch_embed2(x)
|
399 |
+
for i, blk in enumerate(self.blocks2):
|
400 |
+
if self.use_checkpoint and i < self.checkpoint_num[1]:
|
401 |
+
x = checkpoint.checkpoint(blk, x)
|
402 |
+
else:
|
403 |
+
x = blk(x)
|
404 |
+
x_out = self.norm2(x.permute(0, 2, 3, 1))
|
405 |
+
out.append(x_out.permute(0, 3, 1, 2).contiguous())
|
406 |
+
x = self.patch_embed3(x)
|
407 |
+
for i, blk in enumerate(self.blocks3):
|
408 |
+
if self.use_checkpoint and i < self.checkpoint_num[2]:
|
409 |
+
x = checkpoint.checkpoint(blk, x)
|
410 |
+
else:
|
411 |
+
x = blk(x)
|
412 |
+
x_out = self.norm3(x.permute(0, 2, 3, 1))
|
413 |
+
out.append(x_out.permute(0, 3, 1, 2).contiguous())
|
414 |
+
x = self.patch_embed4(x)
|
415 |
+
for i, blk in enumerate(self.blocks4):
|
416 |
+
if self.use_checkpoint and i < self.checkpoint_num[3]:
|
417 |
+
x = checkpoint.checkpoint(blk, x)
|
418 |
+
else:
|
419 |
+
x = blk(x)
|
420 |
+
x_out = self.norm4(x.permute(0, 2, 3, 1))
|
421 |
+
out.append(x_out.permute(0, 3, 1, 2).contiguous())
|
422 |
+
return tuple(out)
|
423 |
+
|
424 |
+
def forward(self, x):
|
425 |
+
x = self.forward_features(x)
|
426 |
+
return x
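With the default arguments above, the backbone returns a four-level feature pyramid at strides 4/8/16/32 whose channel widths follow `embed_dim`. A hedged smoke-test sketch (requires timm plus either mmseg or the bundled annotator.mmpkg fallback so that the BACKBONES registry import succeeds):

backbone = UniFormer(layers=[3, 4, 8, 3], embed_dim=[64, 128, 320, 512], head_dim=64)
feats = backbone(torch.randn(1, 3, 224, 224))
print([tuple(f.shape) for f in feats])
# expected: (1, 64, 56, 56), (1, 128, 28, 28), (1, 320, 14, 14), (1, 512, 7, 7)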
|
extensions/microsoftexcel-controlnet/annotator/uniformer/upernet_global_small.py
ADDED
@@ -0,0 +1,44 @@
_base_ = [
    'configs/_base_/models/upernet_uniformer.py',
    'configs/_base_/datasets/ade20k.py',
    'configs/_base_/default_runtime.py',
    'configs/_base_/schedules/schedule_160k.py'
]

custom_imports = dict(
    imports=['annotator.uniformer.uniformer'],
    allow_failed_imports=False
)

model = dict(
    backbone=dict(
        type='UniFormer',
        embed_dim=[64, 128, 320, 512],
        layers=[3, 4, 8, 3],
        head_dim=64,
        drop_path_rate=0.25,
        windows=False,
        hybrid=False
    ),
    decode_head=dict(
        in_channels=[64, 128, 320, 512],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=320,
        num_classes=150
    ))

# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
                 paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
                                                 'relative_position_bias_table': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))

lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

data = dict(samples_per_gpu=2)
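For context, a hedged sketch of how a config like this is normally consumed with the mmcv/mmseg 0.x APIs the annotator targets; the relative `_base_` paths resolve against the config file's own directory, and the checkpoint argument is left out here because the annotator loads weights separately.

from mmcv import Config
from mmseg.apis import init_segmentor

cfg = Config.fromfile('extensions/microsoftexcel-controlnet/annotator/uniformer/upernet_global_small.py')
segmentor = init_segmentor(cfg, checkpoint=None, device='cpu')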
extensions/microsoftexcel-controlnet/annotator/util.py
ADDED
@@ -0,0 +1,79 @@
import numpy as np
import cv2
import random  # used by img2mask below


def HWC3(x):
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    if C == 4:
        color = x[:, :, 0:3].astype(np.float32)
        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
        y = color * alpha + 255.0 * (1.0 - alpha)
        y = y.clip(0, 255).astype(np.uint8)
        return y


def make_noise_disk(H, W, C, F):
    noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
    noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
    noise = noise[F: F + H, F: F + W]
    noise -= np.min(noise)
    noise /= np.max(noise)
    if C == 1:
        noise = noise[:, :, None]
    return noise


def nms(x, t, s):
    x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)

    f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
    f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
    f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
    f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)

    y = np.zeros_like(x)

    for f in [f1, f2, f3, f4]:
        np.putmask(y, cv2.dilate(x, kernel=f) == x, x)

    z = np.zeros_like(y, dtype=np.uint8)
    z[y > t] = 255
    return z


def min_max_norm(x):
    x -= np.min(x)
    x /= np.maximum(np.max(x), 1e-5)
    return x


def safe_step(x, step=2):
    y = x.astype(np.float32) * float(step + 1)
    y = y.astype(np.int32).astype(np.float32) / float(step)
    return y


def img2mask(img, H, W, low=10, high=90):
    assert img.ndim == 3 or img.ndim == 2
    assert img.dtype == np.uint8

    if img.ndim == 3:
        y = img[:, :, random.randrange(0, img.shape[2])]
    else:
        y = img

    y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)

    if random.uniform(0, 1) < 0.5:
        y = 255 - y

    return y < np.percentile(y, random.randrange(low, high))
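A short usage sketch for the helpers above (the input is synthetic):

edge = (np.random.rand(256, 256) * 255).astype(np.uint8)
rgb = HWC3(edge)                  # grayscale -> 3-channel uint8
thin = nms(edge, t=127, s=3.0)    # keep directional maxima above the threshold
mask = img2mask(rgb, H=128, W=128)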
extensions/microsoftexcel-controlnet/annotator/zoe/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Intelligent Systems Lab Org

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
extensions/microsoftexcel-controlnet/annotator/zoe/__init__.py
ADDED
@@ -0,0 +1,59 @@
import os
import cv2
import numpy as np
import torch

from einops import rearrange
from .zoedepth.models.zoedepth.zoedepth_v1 import ZoeDepth
from .zoedepth.utils.config import get_config
from modules import devices
from annotator.annotator_path import models_path


class ZoeDetector:
    model_dir = os.path.join(models_path, "zoedepth")

    def __init__(self):
        self.model = None
        self.device = devices.get_device_for("controlnet")

    def load_model(self):
        remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ZoeD_M12_N.pt"
        modelpath = os.path.join(self.model_dir, "ZoeD_M12_N.pt")
        if not os.path.exists(modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(remote_model_path, model_dir=self.model_dir)
        conf = get_config("zoedepth", "infer")
        model = ZoeDepth.build_from_config(conf)
        model.load_state_dict(torch.load(modelpath, map_location=model.device)['model'])
        model.eval()
        self.model = model.to(self.device)

    def unload_model(self):
        if self.model is not None:
            self.model.cpu()

    def __call__(self, input_image):
        if self.model is None:
            self.load_model()
        self.model.to(self.device)

        assert input_image.ndim == 3
        image_depth = input_image
        with torch.no_grad():
            image_depth = torch.from_numpy(image_depth).float().to(self.device)
            image_depth = image_depth / 255.0
            image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
            depth = self.model.infer(image_depth)

            depth = depth[0, 0].cpu().numpy()

            vmin = np.percentile(depth, 2)
            vmax = np.percentile(depth, 85)

            depth -= vmin
            depth /= vmax - vmin
            depth = 1.0 - depth
            depth_image = (depth * 255.0).clip(0, 255).astype(np.uint8)

            return depth_image
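A hedged usage note: ZoeDetector is written to run inside the WebUI process (it takes its device from `modules.devices` and downloads ZoeD_M12_N.pt into the shared annotator model directory on first use), so it is normally invoked by the ControlNet preprocessor rather than standalone. Inside that environment the call pattern is simply (`input_image` is a placeholder HxWx3 uint8 array):

detector = ZoeDetector()
depth_map = detector(input_image)   # HxWx3 image in, uint8 HxW depth visualization out
detector.unload_model()             # optionally move the network back to the CPU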
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/__init__.py
ADDED
@@ -0,0 +1,24 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/__init__.py
ADDED
@@ -0,0 +1,24 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas.py
ADDED
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# MIT License
+import os
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import torch
+import torch.nn as nn
+import numpy as np
+from torchvision.transforms import Normalize
+
+
+def denormalize(x):
+    """Reverses the imagenet normalization applied to the input.
+
+    Args:
+        x (torch.Tensor - shape(N,3,H,W)): input tensor
+
+    Returns:
+        torch.Tensor - shape(N,3,H,W): Denormalized input
+    """
+    mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
+    std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
+    return x * std + mean
+
+def get_activation(name, bank):
+    def hook(model, input, output):
+        bank[name] = output
+    return hook
+
+
+class Resize(object):
+    """Resize sample to given size (width, height).
+    """
+
+    def __init__(
+        self,
+        width,
+        height,
+        resize_target=True,
+        keep_aspect_ratio=False,
+        ensure_multiple_of=1,
+        resize_method="lower_bound",
+    ):
+        """Init.
+        Args:
+            width (int): desired output width
+            height (int): desired output height
+            resize_target (bool, optional):
+                True: Resize the full sample (image, mask, target).
+                False: Resize image only.
+                Defaults to True.
+            keep_aspect_ratio (bool, optional):
+                True: Keep the aspect ratio of the input sample.
+                Output sample might not have the given width and height, and
+                resize behaviour depends on the parameter 'resize_method'.
+                Defaults to False.
+            ensure_multiple_of (int, optional):
+                Output width and height is constrained to be multiple of this parameter.
+                Defaults to 1.
+            resize_method (str, optional):
+                "lower_bound": Output will be at least as large as the given size.
+                "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
+                "minimal": Scale as least as possible. (Output size might be smaller than given size.)
+                Defaults to "lower_bound".
+        """
+        print("Params passed to Resize transform:")
+        print("\twidth: ", width)
+        print("\theight: ", height)
+        print("\tresize_target: ", resize_target)
+        print("\tkeep_aspect_ratio: ", keep_aspect_ratio)
+        print("\tensure_multiple_of: ", ensure_multiple_of)
+        print("\tresize_method: ", resize_method)
+
+        self.__width = width
+        self.__height = height
+
+        self.__keep_aspect_ratio = keep_aspect_ratio
+        self.__multiple_of = ensure_multiple_of
+        self.__resize_method = resize_method
+
+    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
+        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+        if max_val is not None and y > max_val:
+            y = (np.floor(x / self.__multiple_of)
+                 * self.__multiple_of).astype(int)
+
+        if y < min_val:
+            y = (np.ceil(x / self.__multiple_of)
+                 * self.__multiple_of).astype(int)
+
+        return y
+
+    def get_size(self, width, height):
+        # determine new height and width
+        scale_height = self.__height / height
+        scale_width = self.__width / width
+
+        if self.__keep_aspect_ratio:
+            if self.__resize_method == "lower_bound":
+                # scale such that output size is lower bound
+                if scale_width > scale_height:
+                    # fit width
+                    scale_height = scale_width
+                else:
+                    # fit height
+                    scale_width = scale_height
+            elif self.__resize_method == "upper_bound":
+                # scale such that output size is upper bound
+                if scale_width < scale_height:
+                    # fit width
+                    scale_height = scale_width
+                else:
+                    # fit height
+                    scale_width = scale_height
+            elif self.__resize_method == "minimal":
+                # scale as least as possbile
+                if abs(1 - scale_width) < abs(1 - scale_height):
+                    # fit width
+                    scale_height = scale_width
+                else:
+                    # fit height
+                    scale_width = scale_height
+            else:
+                raise ValueError(
+                    f"resize_method {self.__resize_method} not implemented"
+                )
+
+        if self.__resize_method == "lower_bound":
+            new_height = self.constrain_to_multiple_of(
+                scale_height * height, min_val=self.__height
+            )
+            new_width = self.constrain_to_multiple_of(
+                scale_width * width, min_val=self.__width
+            )
+        elif self.__resize_method == "upper_bound":
+            new_height = self.constrain_to_multiple_of(
+                scale_height * height, max_val=self.__height
+            )
+            new_width = self.constrain_to_multiple_of(
+                scale_width * width, max_val=self.__width
+            )
+        elif self.__resize_method == "minimal":
+            new_height = self.constrain_to_multiple_of(scale_height * height)
+            new_width = self.constrain_to_multiple_of(scale_width * width)
+        else:
+            raise ValueError(
+                f"resize_method {self.__resize_method} not implemented")
+
+        return (new_width, new_height)
+
+    def __call__(self, x):
+        width, height = self.get_size(*x.shape[-2:][::-1])
+        return nn.functional.interpolate(x, (height, width), mode='bilinear', align_corners=True)
+
+class PrepForMidas(object):
+    def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True):
+        if isinstance(img_size, int):
+            img_size = (img_size, img_size)
+        net_h, net_w = img_size
+        self.normalization = Normalize(
+            mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+        self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=32, resize_method=resize_mode) \
+            if do_resize else nn.Identity()
+
+    def __call__(self, x):
+        return self.normalization(self.resizer(x))
+
+
+class MidasCore(nn.Module):
+    def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True,
+                 img_size=384, **kwargs):
+        """Midas Base model used for multi-scale feature extraction.
+
+        Args:
+            midas (torch.nn.Module): Midas model.
+            trainable (bool, optional): Train midas model. Defaults to False.
+            fetch_features (bool, optional): Extract multi-scale features. Defaults to True.
+            layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1').
+            freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False.
+            keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True.
+            img_size (int, tuple, optional): Input resolution. Defaults to 384.
+        """
+        super().__init__()
+        self.core = midas
+        self.output_channels = None
+        self.core_out = {}
+        self.trainable = trainable
+        self.fetch_features = fetch_features
+        # midas.scratch.output_conv = nn.Identity()
+        self.handles = []
+        # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1']
+        self.layer_names = layer_names
+
+        self.set_trainable(trainable)
+        self.set_fetch_features(fetch_features)
+
+        self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio,
+                                 img_size=img_size, do_resize=kwargs.get('do_resize', True))
+
+        if freeze_bn:
+            self.freeze_bn()
+
+    def set_trainable(self, trainable):
+        self.trainable = trainable
+        if trainable:
+            self.unfreeze()
+        else:
+            self.freeze()
+        return self
+
+    def set_fetch_features(self, fetch_features):
+        self.fetch_features = fetch_features
+        if fetch_features:
+            if len(self.handles) == 0:
+                self.attach_hooks(self.core)
+        else:
+            self.remove_hooks()
+        return self
+
+    def freeze(self):
+        for p in self.parameters():
+            p.requires_grad = False
+        self.trainable = False
+        return self
+
+    def unfreeze(self):
+        for p in self.parameters():
+            p.requires_grad = True
+        self.trainable = True
+        return self
+
+    def freeze_bn(self):
+        for m in self.modules():
+            if isinstance(m, nn.BatchNorm2d):
+                m.eval()
+        return self
+
+    def forward(self, x, denorm=False, return_rel_depth=False):
+        with torch.no_grad():
+            if denorm:
+                x = denormalize(x)
+            x = self.prep(x)
+        # print("Shape after prep: ", x.shape)
+
+        with torch.set_grad_enabled(self.trainable):
+
+            # print("Input size to Midascore", x.shape)
+            rel_depth = self.core(x)
+            # print("Output from midas shape", rel_depth.shape)
+            if not self.fetch_features:
+                return rel_depth
+            out = [self.core_out[k] for k in self.layer_names]
+
+            if return_rel_depth:
+                return rel_depth, out
+            return out
+
+    def get_rel_pos_params(self):
+        for name, p in self.core.pretrained.named_parameters():
+            if "relative_position" in name:
+                yield p
+
+    def get_enc_params_except_rel_pos(self):
+        for name, p in self.core.pretrained.named_parameters():
+            if "relative_position" not in name:
+                yield p
+
+    def freeze_encoder(self, freeze_rel_pos=False):
+        if freeze_rel_pos:
+            for p in self.core.pretrained.parameters():
+                p.requires_grad = False
+        else:
+            for p in self.get_enc_params_except_rel_pos():
+                p.requires_grad = False
+        return self
+
+    def attach_hooks(self, midas):
+        if len(self.handles) > 0:
+            self.remove_hooks()
+        if "out_conv" in self.layer_names:
+            self.handles.append(list(midas.scratch.output_conv.children())[
+                                3].register_forward_hook(get_activation("out_conv", self.core_out)))
+        if "r4" in self.layer_names:
+            self.handles.append(midas.scratch.refinenet4.register_forward_hook(
+                get_activation("r4", self.core_out)))
+        if "r3" in self.layer_names:
+            self.handles.append(midas.scratch.refinenet3.register_forward_hook(
+                get_activation("r3", self.core_out)))
+        if "r2" in self.layer_names:
+            self.handles.append(midas.scratch.refinenet2.register_forward_hook(
+                get_activation("r2", self.core_out)))
+        if "r1" in self.layer_names:
+            self.handles.append(midas.scratch.refinenet1.register_forward_hook(
+                get_activation("r1", self.core_out)))
+        if "l4_rn" in self.layer_names:
+            self.handles.append(midas.scratch.layer4_rn.register_forward_hook(
+                get_activation("l4_rn", self.core_out)))
+
+        return self
+
+    def remove_hooks(self):
+        for h in self.handles:
+            h.remove()
+        return self
+
+    def __del__(self):
+        self.remove_hooks()
+
+    def set_output_channels(self, model_type):
+        self.output_channels = MIDAS_SETTINGS[model_type]
+
+    @staticmethod
+    def build(midas_model_type="DPT_BEiT_L_384", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs):
+        if midas_model_type not in MIDAS_SETTINGS:
+            raise ValueError(
+                f"Invalid model type: {midas_model_type}. Must be one of {list(MIDAS_SETTINGS.keys())}")
+        if "img_size" in kwargs:
+            kwargs = MidasCore.parse_img_size(kwargs)
+        img_size = kwargs.pop("img_size", [384, 384])
+        print("img_size", img_size)
+        midas_path = os.path.join(os.path.dirname(__file__), 'midas_repo')
+        midas = torch.hub.load(midas_path, midas_model_type,
+                               pretrained=use_pretrained_midas, force_reload=force_reload, source='local')
+        kwargs.update({'keep_aspect_ratio': force_keep_ar})
+        midas_core = MidasCore(midas, trainable=train_midas, fetch_features=fetch_features,
+                               freeze_bn=freeze_bn, img_size=img_size, **kwargs)
+        midas_core.set_output_channels(midas_model_type)
+        return midas_core
+
+    @staticmethod
+    def build_from_config(config):
+        return MidasCore.build(**config)
+
+    @staticmethod
+    def parse_img_size(config):
+        assert 'img_size' in config
+        if isinstance(config['img_size'], str):
+            assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W"
+            config['img_size'] = list(map(int, config['img_size'].split(",")))
+            assert len(
+                config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W"
+        elif isinstance(config['img_size'], int):
+            config['img_size'] = [config['img_size'], config['img_size']]
+        else:
+            assert isinstance(config['img_size'], list) and len(
+                config['img_size']) == 2, "img_size should be a list of H,W"
+        return config
+
+
+nchannels2models = {
+    tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"],
+    (512, 256, 128, 64, 64): ["MiDaS_small"]
+}
+
+# Model name to number of output channels
+MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items()
+                  for m in v
+                  }
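Editor's note: a minimal usage sketch of the `MidasCore` wrapper added above, assuming the bundled `midas_repo` checkout and the pretrained weights are available (the model names and keyword arguments are the ones defined in the file; everything else is illustrative, not part of the diff).

```python
import torch

# build() resolves the MiDaS model through torch.hub.load(<local midas_repo>, ...),
# so it assumes that directory and downloadable release weights.
core = MidasCore.build(midas_model_type="DPT_BEiT_L_384",
                       use_pretrained_midas=True,
                       fetch_features=True,   # attach forward hooks on the decoder layers
                       img_size=384)

x = torch.rand(1, 3, 480, 640)                # any RGB batch; prep() resizes and normalizes it
rel_depth, feats = core(x, return_rel_depth=True)
print(rel_depth.shape, [f.shape for f in feats])  # feats follow the layer_names order
```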
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/.gitignore
ADDED
@@ -0,0 +1,110 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+*.png
+*.pfm
+*.jpg
+*.jpeg
+*.pt
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/Dockerfile
ADDED
@@ -0,0 +1,29 @@
+# enables cuda support in docker
+FROM nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
+
+# install python 3.6, pip and requirements for opencv-python
+# (see https://github.com/NVIDIA/nvidia-docker/issues/864)
+RUN apt-get update && apt-get -y install \
+    python3 \
+    python3-pip \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# install python dependencies
+RUN pip3 install --upgrade pip
+RUN pip3 install torch~=1.8 torchvision opencv-python-headless~=3.4 timm
+
+# copy inference code
+WORKDIR /opt/MiDaS
+COPY ./midas ./midas
+COPY ./*.py ./
+
+# download model weights so the docker image can be used offline
+RUN cd weights && {curl -OL https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt; cd -; }
+RUN python3 run.py --model_type dpt_hybrid; exit 0
+
+# entrypoint (dont forget to mount input and output directories)
+CMD python3 run.py --model_type dpt_hybrid
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/README.md
ADDED
@@ -0,0 +1,259 @@
+## Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer
+
+This repository contains code to compute depth from a single image. It accompanies our [paper](https://arxiv.org/abs/1907.01341v3):
+
+>Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer
+René Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, Vladlen Koltun
+
+
+and our [preprint](https://arxiv.org/abs/2103.13413):
+
+> Vision Transformers for Dense Prediction
+> René Ranftl, Alexey Bochkovskiy, Vladlen Koltun
+
+
+MiDaS was trained on up to 12 datasets (ReDWeb, DIML, Movies, MegaDepth, WSVD, TartanAir, HRWSI, ApolloScape, BlendedMVS, IRS, KITTI, NYU Depth V2) with
+multi-objective optimization.
+The original model that was trained on 5 datasets (`MIX 5` in the paper) can be found [here](https://github.com/isl-org/MiDaS/releases/tag/v2).
+The figure below shows an overview of the different MiDaS models; the bubble size scales with number of parameters.
+
+![](figures/Improvement_vs_FPS.png)
+
+### Setup
+
+1) Pick one or more models and download the corresponding weights to the `weights` folder:
+
+MiDaS 3.1
+- For highest quality: [dpt_beit_large_512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt)
+- For moderately less quality, but better speed-performance trade-off: [dpt_swin2_large_384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt)
+- For embedded devices: [dpt_swin2_tiny_256](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt), [dpt_levit_224](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt)
+- For inference on Intel CPUs, OpenVINO may be used for the small legacy model: openvino_midas_v21_small [.xml](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.xml), [.bin](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.bin)
+
+MiDaS 3.0: Legacy transformer models [dpt_large_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) and [dpt_hybrid_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt)
+
+MiDaS 2.1: Legacy convolutional models [midas_v21_384](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) and [midas_v21_small_256](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt)
+
+1) Set up dependencies:
+
+```shell
+conda env create -f environment.yaml
+conda activate midas-py310
+```
+
+#### optional
+
+For the Next-ViT model, execute
+
+```shell
+git submodule add https://github.com/isl-org/Next-ViT midas/external/next_vit
+```
+
+For the OpenVINO model, install
+
+```shell
+pip install openvino
+```
+
+### Usage
+
+1) Place one or more input images in the folder `input`.
+
+2) Run the model with
+
+```shell
+python run.py --model_type <model_type> --input_path input --output_path output
+```
+where ```<model_type>``` is chosen from [dpt_beit_large_512](#model_type), [dpt_beit_large_384](#model_type),
+[dpt_beit_base_384](#model_type), [dpt_swin2_large_384](#model_type), [dpt_swin2_base_384](#model_type),
+[dpt_swin2_tiny_256](#model_type), [dpt_swin_large_384](#model_type), [dpt_next_vit_large_384](#model_type),
+[dpt_levit_224](#model_type), [dpt_large_384](#model_type), [dpt_hybrid_384](#model_type),
+[midas_v21_384](#model_type), [midas_v21_small_256](#model_type), [openvino_midas_v21_small_256](#model_type).
+
+3) The resulting depth maps are written to the `output` folder.
+
+#### optional
+
+1) By default, the inference resizes the height of input images to the size of a model to fit into the encoder. This
+size is given by the numbers in the model names of the [accuracy table](#accuracy). Some models do not only support a single
+inference height but a range of different heights. Feel free to explore different heights by appending the extra
+command line argument `--height`. Unsupported height values will throw an error. Note that using this argument may
+decrease the model accuracy.
+2) By default, the inference keeps the aspect ratio of input images when feeding them into the encoder if this is
+supported by a model (all models except for Swin, Swin2, LeViT). In order to resize to a square resolution,
+disregarding the aspect ratio while preserving the height, use the command line argument `--square`.
+
+#### via Camera
+
+If you want the input images to be grabbed from the camera and shown in a window, leave the input and output paths
+away and choose a model type as shown above:
+
+```shell
+python run.py --model_type <model_type> --side
+```
+
+The argument `--side` is optional and causes both the input RGB image and the output depth map to be shown
+side-by-side for comparison.
+
+#### via Docker
+
+1) Make sure you have installed Docker and the
+[NVIDIA Docker runtime](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-\(Native-GPU-Support\)).
+
+2) Build the Docker image:
+
+```shell
+docker build -t midas .
+```
+
+3) Run inference:
+
+```shell
+docker run --rm --gpus all -v $PWD/input:/opt/MiDaS/input -v $PWD/output:/opt/MiDaS/output -v $PWD/weights:/opt/MiDaS/weights midas
+```
+
+This command passes through all of your NVIDIA GPUs to the container, mounts the
+`input` and `output` directories and then runs the inference.
+
+#### via PyTorch Hub
+
+The pretrained model is also available on [PyTorch Hub](https://pytorch.org/hub/intelisl_midas_v2/)
+
+#### via TensorFlow or ONNX
+
+See [README](https://github.com/isl-org/MiDaS/tree/master/tf) in the `tf` subdirectory.
+
+Currently only supports MiDaS v2.1.
+
+
+#### via Mobile (iOS / Android)
+
+See [README](https://github.com/isl-org/MiDaS/tree/master/mobile) in the `mobile` subdirectory.
+
+#### via ROS1 (Robot Operating System)
+
+See [README](https://github.com/isl-org/MiDaS/tree/master/ros) in the `ros` subdirectory.
+
+Currently only supports MiDaS v2.1. DPT-based models to be added.
+
+
+### Accuracy
+
+We provide a **zero-shot error** $\epsilon_d$ which is evaluated for 6 different datasets
+(see [paper](https://arxiv.org/abs/1907.01341v3)). **Lower error values are better**.
+$\color{green}{\textsf{Overall model quality is represented by the improvement}}$ ([Imp.](#improvement)) with respect to
+MiDaS 3.0 DPT<sub>L-384</sub>. The models are grouped by the height used for inference, whereas the square training resolution is given by
+the numbers in the model names. The table also shows the **number of parameters** (in millions) and the
+**frames per second** for inference at the training resolution (for GPU RTX 3090):
+
+| MiDaS Model | DIW </br><sup>WHDR</sup> | Eth3d </br><sup>AbsRel</sup> | Sintel </br><sup>AbsRel</sup> | TUM </br><sup>δ1</sup> | KITTI </br><sup>δ1</sup> | NYUv2 </br><sup>δ1</sup> | $\color{green}{\textsf{Imp.}}$ </br><sup>%</sup> | Par.</br><sup>M</sup> | FPS</br><sup> </sup> |
+|-----------------------------------------------------------------------------------------------------------------------|-------------------------:|-----------------------------:|------------------------------:|-------------------------:|-------------------------:|-------------------------:|-------------------------------------------------:|----------------------:|--------------------------:|
+| **Inference height 512** | | | | | | | | | |
+| [v3.1 BEiT<sub>L-512</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1137 | 0.0659 | 0.2366 | **6.13** | 11.56* | **1.86*** | $\color{green}{\textsf{19}}$ | **345** | **5.7** |
+| [v3.1 BEiT<sub>L-512</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt)$\tiny{\square}$ | **0.1121** | **0.0614** | **0.2090** | 6.46 | **5.00*** | 1.90* | $\color{green}{\textsf{34}}$ | **345** | **5.7** |
+| | | | | | | | | | |
+| **Inference height 384** | | | | | | | | | |
+| [v3.1 BEiT<sub>L-512</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1245 | 0.0681 | **0.2176** | **6.13** | 6.28* | **2.16*** | $\color{green}{\textsf{28}}$ | 345 | 12 |
+| [v3.1 Swin2<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt)$\tiny{\square}$ | 0.1106 | 0.0732 | 0.2442 | 8.87 | **5.84*** | 2.92* | $\color{green}{\textsf{22}}$ | 213 | 41 |
+| [v3.1 Swin2<sub>B-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt)$\tiny{\square}$ | 0.1095 | 0.0790 | 0.2404 | 8.93 | 5.97* | 3.28* | $\color{green}{\textsf{22}}$ | 102 | 39 |
+| [v3.1 Swin<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt)$\tiny{\square}$ | 0.1126 | 0.0853 | 0.2428 | 8.74 | 6.60* | 3.34* | $\color{green}{\textsf{17}}$ | 213 | 49 |
+| [v3.1 BEiT<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt) | 0.1239 | **0.0667** | 0.2545 | 7.17 | 9.84* | 2.21* | $\color{green}{\textsf{17}}$ | 344 | 13 |
+| [v3.1 Next-ViT<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt) | **0.1031** | 0.0954 | 0.2295 | 9.21 | 6.89* | 3.47* | $\color{green}{\textsf{16}}$ | **72** | 30 |
+| [v3.1 BEiT<sub>B-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt) | 0.1159 | 0.0967 | 0.2901 | 9.88 | 26.60* | 3.91* | $\color{green}{\textsf{-31}}$ | 112 | 31 |
+| [v3.0 DPT<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) | 0.1082 | 0.0888 | 0.2697 | 9.97 | 8.46 | 8.32 | $\color{green}{\textsf{0}}$ | 344 | **61** |
+| [v3.0 DPT<sub>H-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt) | 0.1106 | 0.0934 | 0.2741 | 10.89 | 11.56 | 8.69 | $\color{green}{\textsf{-10}}$ | 123 | 50 |
+| [v2.1 Large<sub>384</sub>](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) | 0.1295 | 0.1155 | 0.3285 | 12.51 | 16.08 | 8.71 | $\color{green}{\textsf{-32}}$ | 105 | 47 |
+| | | | | | | | | | |
+| **Inference height 256** | | | | | | | | | |
+| [v3.1 Swin2<sub>T-256</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt)$\tiny{\square}$ | **0.1211** | **0.1106** | **0.2868** | **13.43** | **10.13*** | **5.55*** | $\color{green}{\textsf{-11}}$ | 42 | 64 |
+| [v2.1 Small<sub>256</sub>](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt) | 0.1344 | 0.1344 | 0.3370 | 14.53 | 29.27 | 13.43 | $\color{green}{\textsf{-76}}$ | **21** | **90** |
+| | | | | | | | | | |
+| **Inference height 224** | | | | | | | | | |
+| [v3.1 LeViT<sub>224</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt)$\tiny{\square}$ | **0.1314** | **0.1206** | **0.3148** | **18.21** | **15.27*** | **8.64*** | $\color{green}{\textsf{-40}}$ | **51** | **73** |
+
+* No zero-shot error, because models are also trained on KITTI and NYU Depth V2\
+$\square$ Validation performed at **square resolution**, either because the transformer encoder backbone of a model
+does not support non-square resolutions (Swin, Swin2, LeViT) or for comparison with these models. All other
+validations keep the aspect ratio. A difference in resolution limits the comparability of the zero-shot error and the
+improvement, because these quantities are averages over the pixels of an image and do not take into account the
+advantage of more details due to a higher resolution.\
+Best values per column and same validation height in bold
+
+#### Improvement
+
+The improvement in the above table is defined as the relative zero-shot error with respect to MiDaS v3.0
+DPT<sub>L-384</sub> and averaging over the datasets. So, if $\epsilon_d$ is the zero-shot error for dataset $d$, then
+the $\color{green}{\textsf{improvement}}$ is given by $100(1-(1/6)\sum_d\epsilon_d/\epsilon_{d,\rm{DPT_{L-384}}})$%.
+
+Note that the improvements of 10% for MiDaS v2.0 → v2.1 and 21% for MiDaS v2.1 → v3.0 are not visible from the
+improvement column (Imp.) in the table but would require an evaluation with respect to MiDaS v2.1 Large<sub>384</sub>
+and v2.0 Large<sub>384</sub> respectively instead of v3.0 DPT<sub>L-384</sub>.
+
+### Depth map comparison
+
+Zoom in for better visibility
+![](figures/Comparison.png)
+
+### Speed on Camera Feed
+
+Test configuration
+- Windows 10
+- 11th Gen Intel Core i7-1185G7 3.00GHz
+- 16GB RAM
+- Camera resolution 640x480
+- openvino_midas_v21_small_256
+
+Speed: 22 FPS
+
+### Changelog
+
+* [Dec 2022] Released MiDaS v3.1:
+- New models based on 5 different types of transformers ([BEiT](https://arxiv.org/pdf/2106.08254.pdf), [Swin2](https://arxiv.org/pdf/2111.09883.pdf), [Swin](https://arxiv.org/pdf/2103.14030.pdf), [Next-ViT](https://arxiv.org/pdf/2207.05501.pdf), [LeViT](https://arxiv.org/pdf/2104.01136.pdf))
+- Training datasets extended from 10 to 12, including also KITTI and NYU Depth V2 using [BTS](https://github.com/cleinc/bts) split
+- Best model, BEiT<sub>Large 512</sub>, with resolution 512x512, is on average about [28% more accurate](#Accuracy) than MiDaS v3.0
+- Integrated live depth estimation from camera feed
+* [Sep 2021] Integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See [Gradio Web Demo](https://huggingface.co/spaces/akhaliq/DPT-Large).
+* [Apr 2021] Released MiDaS v3.0:
+- New models based on [Dense Prediction Transformers](https://arxiv.org/abs/2103.13413) are on average [21% more accurate](#Accuracy) than MiDaS v2.1
+- Additional models can be found [here](https://github.com/isl-org/DPT)
+* [Nov 2020] Released MiDaS v2.1:
+- New model that was trained on 10 datasets and is on average about [10% more accurate](#Accuracy) than [MiDaS v2.0](https://github.com/isl-org/MiDaS/releases/tag/v2)
+- New light-weight model that achieves [real-time performance](https://github.com/isl-org/MiDaS/tree/master/mobile) on mobile platforms.
+- Sample applications for [iOS](https://github.com/isl-org/MiDaS/tree/master/mobile/ios) and [Android](https://github.com/isl-org/MiDaS/tree/master/mobile/android)
+- [ROS package](https://github.com/isl-org/MiDaS/tree/master/ros) for easy deployment on robots
+* [Jul 2020] Added TensorFlow and ONNX code. Added [online demo](http://35.202.76.57/).
+* [Dec 2019] Released new version of MiDaS - the new model is significantly more accurate and robust
+* [Jul 2019] Initial release of MiDaS ([Link](https://github.com/isl-org/MiDaS/releases/tag/v1))
+
+### Citation
+
+Please cite our paper if you use this code or any of the models:
+```
+@ARTICLE {Ranftl2022,
+    author  = "Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun",
+    title   = "Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-Shot Cross-Dataset Transfer",
+    journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence",
+    year    = "2022",
+    volume  = "44",
+    number  = "3"
+}
+```
+
+If you use a DPT-based model, please also cite:
+
+```
+@article{Ranftl2021,
+	author    = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun},
+	title     = {Vision Transformers for Dense Prediction},
+	journal   = {ICCV},
+	year      = {2021},
+}
+```
+
+### Acknowledgements
+
+Our work builds on and uses code from [timm](https://github.com/rwightman/pytorch-image-models) and [Next-ViT](https://github.com/bytedance/Next-ViT).
+We'd like to thank the authors for making these libraries available.
+
+### License
+
+MIT License
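Editor's note: the "Improvement" formula in the README above can be checked directly against its accuracy table. A minimal sketch, using only the zero-shot errors quoted in the table for the v3.0 DPT_L-384 baseline and the v2.1 Large_384 model:

```python
# Reproduce the "Imp." column from the README's formula 100 * (1 - (1/6) * sum(eps_d / eps_d_baseline)).
baseline  = [0.1082, 0.0888, 0.2697, 9.97, 8.46, 8.32]    # v3.0 DPT_L-384: DIW, Eth3d, Sintel, TUM, KITTI, NYUv2
candidate = [0.1295, 0.1155, 0.3285, 12.51, 16.08, 8.71]  # v2.1 Large_384, same column order

improvement = 100 * (1 - sum(c / b for c, b in zip(candidate, baseline)) / 6)
print(round(improvement))  # -32, matching the table entry for v2.1 Large_384
```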
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/environment.yaml
ADDED
@@ -0,0 +1,16 @@
+name: midas-py310
+channels:
+  - pytorch
+  - defaults
+dependencies:
+  - nvidia::cudatoolkit=11.7
+  - python=3.10.8
+  - pytorch::pytorch=1.13.0
+  - torchvision=0.14.0
+  - pip=22.3.1
+  - numpy=1.23.4
+  - pip:
+      - opencv-python==4.6.0.66
+      - imutils==0.5.4
+      - timm==0.6.12
+      - einops==0.6.0
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/hubconf.py
ADDED
@@ -0,0 +1,435 @@
+dependencies = ["torch"]
+
+import torch
+
+from midas.dpt_depth import DPTDepthModel
+from midas.midas_net import MidasNet
+from midas.midas_net_custom import MidasNet_small
+
+def DPT_BEiT_L_512(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_BEiT_L_512 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="beitl16_512",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_BEiT_L_384(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_BEiT_L_384 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="beitl16_384",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_BEiT_B_384(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_BEiT_B_384 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="beitb16_384",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_SwinV2_L_384(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_SwinV2_L_384 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="swin2l24_384",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_SwinV2_B_384(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_SwinV2_B_384 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="swin2b24_384",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_SwinV2_T_256(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_SwinV2_T_256 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="swin2t16_256",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_Swin_L_384(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_Swin_L_384 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="swinl12_384",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_Next_ViT_L_384(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_Next_ViT_L_384 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="next_vit_large_6m",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_LeViT_224(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT_LeViT_224 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="levit_384",
+        non_negative=True,
+        head_features_1=64,
+        head_features_2=8,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_Large(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT-Large model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="vitl16_384",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def DPT_Hybrid(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS DPT-Hybrid model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = DPTDepthModel(
+        path=None,
+        backbone="vitb_rn50_384",
+        non_negative=True,
+    )
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def MiDaS(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS v2.1 model for monocular depth estimation
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = MidasNet()
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+def MiDaS_small(pretrained=True, **kwargs):
+    """ # This docstring shows up in hub.help()
+    MiDaS v2.1 small model for monocular depth estimation on resource-constrained devices
+    pretrained (bool): load pretrained weights into model
+    """
+
+    model = MidasNet_small(None, features=64, backbone="efficientnet_lite3", exportable=True, non_negative=True, blocks={'expand': True})
+
+    if pretrained:
+        checkpoint = (
+            "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt"
+        )
+        state_dict = torch.hub.load_state_dict_from_url(
+            checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
+        )
+        model.load_state_dict(state_dict)
+
+    return model
+
+
+def transforms():
+    import cv2
+    from torchvision.transforms import Compose
+    from midas.transforms import Resize, NormalizeImage, PrepareForNet
+    from midas import transforms
+
+    transforms.default_transform = Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            Resize(
+                384,
+                384,
+                resize_target=None,
+                keep_aspect_ratio=True,
+                ensure_multiple_of=32,
+                resize_method="upper_bound",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+            PrepareForNet(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+    transforms.small_transform = Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            Resize(
+                256,
+                256,
+                resize_target=None,
+                keep_aspect_ratio=True,
+                ensure_multiple_of=32,
+                resize_method="upper_bound",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+            PrepareForNet(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+    transforms.dpt_transform = Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            Resize(
+                384,
+                384,
+                resize_target=None,
+                keep_aspect_ratio=True,
+                ensure_multiple_of=32,
+                resize_method="minimal",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+            PrepareForNet(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+    transforms.beit512_transform = Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            Resize(
+                512,
+                512,
+                resize_target=None,
+                keep_aspect_ratio=True,
+                ensure_multiple_of=32,
+                resize_method="minimal",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+            PrepareForNet(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+    transforms.swin384_transform = Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            Resize(
+                384,
+                384,
+                resize_target=None,
+                keep_aspect_ratio=False,
+                ensure_multiple_of=32,
+                resize_method="minimal",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+            PrepareForNet(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+    transforms.swin256_transform = Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            Resize(
+                256,
+                256,
+                resize_target=None,
+                keep_aspect_ratio=False,
+                ensure_multiple_of=32,
+                resize_method="minimal",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+            PrepareForNet(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+    transforms.levit_transform = Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            Resize(
+                224,
+                224,
+                resize_target=None,
+                keep_aspect_ratio=False,
+                ensure_multiple_of=32,
+                resize_method="minimal",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+            PrepareForNet(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+    return transforms
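Editor's note: a minimal sketch of how these hub entrypoints are typically consumed through PyTorch Hub, as mentioned in the README above. It assumes network access to the isl-org/MiDaS repository and its release weights; the entrypoint names come from the hubconf.py shown here, the input-handling lines are illustrative only.

```python
import torch

# "DPT_Large" and "transforms" are entrypoints defined in the hubconf.py above.
model = torch.hub.load("isl-org/MiDaS", "DPT_Large")   # DPTDepthModel + dpt_large_384.pt weights
model.eval()

midas_transforms = torch.hub.load("isl-org/MiDaS", "transforms")
transform = midas_transforms.dpt_transform             # 384px "minimal" resize, 0.5/0.5 normalization

# img: HxWx3 RGB numpy array in [0, 255]
# batch = transform(img)                               # -> 1x3xHxW normalized tensor
# with torch.no_grad():
#     prediction = model(batch)                        # relative inverse depth map
```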
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/input/.placeholder
ADDED
File without changes
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/beit.py
ADDED
@@ -0,0 +1,196 @@
+import timm
+import torch
+import types
+
+import numpy as np
+import torch.nn.functional as F
+
+from .utils import forward_adapted_unflatten, make_backbone_default
+from timm.models.beit import gen_relative_position_index
+from torch.utils.checkpoint import checkpoint
+from typing import Optional
+
+
+def forward_beit(pretrained, x):
+    return forward_adapted_unflatten(pretrained, x, "forward_features")
+
+
+def patch_embed_forward(self, x):
+    """
+    Modification of timm.models.layers.patch_embed.py: PatchEmbed.forward to support arbitrary window sizes.
+    """
+    x = self.proj(x)
+    if self.flatten:
+        x = x.flatten(2).transpose(1, 2)
+    x = self.norm(x)
+    return x
+
+
+def _get_rel_pos_bias(self, window_size):
+    """
+    Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
+    """
+    old_height = 2 * self.window_size[0] - 1
+    old_width = 2 * self.window_size[1] - 1
+
+    new_height = 2 * window_size[0] - 1
+    new_width = 2 * window_size[1] - 1
+
+    old_relative_position_bias_table = self.relative_position_bias_table
+
+    old_num_relative_distance = self.num_relative_distance
+    new_num_relative_distance = new_height * new_width + 3
+
+    old_sub_table = old_relative_position_bias_table[:old_num_relative_distance - 3]
+
+    old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2)
+    new_sub_table = F.interpolate(old_sub_table, size=(new_height, new_width), mode="bilinear")
+    new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1)
+
+    new_relative_position_bias_table = torch.cat(
+        [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3:]])
+
+    key = str(window_size[1]) + "," + str(window_size[0])
+    if key not in self.relative_position_indices.keys():
+        self.relative_position_indices[key] = gen_relative_position_index(window_size)
+
+    relative_position_bias = new_relative_position_bias_table[
+        self.relative_position_indices[key].view(-1)].view(
+        window_size[0] * window_size[1] + 1,
+        window_size[0] * window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
+    relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+    return relative_position_bias.unsqueeze(0)
+
+
+def attention_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None):
+    """
+    Modification of timm.models.beit.py: Attention.forward to support arbitrary window sizes.
+    """
+    B, N, C = x.shape
+
+    qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) if self.q_bias is not None else None
+    qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+    qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+    q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+
+    q = q * self.scale
+    attn = (q @ k.transpose(-2, -1))
+
+    if self.relative_position_bias_table is not None:
+        window_size = tuple(np.array(resolution) // 16)
+        attn = attn + self._get_rel_pos_bias(window_size)
+    if shared_rel_pos_bias is not None:
+        attn = attn + shared_rel_pos_bias
+
+    attn = attn.softmax(dim=-1)
+    attn = self.attn_drop(attn)
+
+    x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+    x = self.proj(x)
+    x = self.proj_drop(x)
+    return x
+
+
+def block_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None):
+    """
+    Modification of timm.models.beit.py: Block.forward to support arbitrary window sizes.
+    """
+    if self.gamma_1 is None:
+        x = x + self.drop_path(self.attn(self.norm1(x), resolution, shared_rel_pos_bias=shared_rel_pos_bias))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+    else:
+        x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), resolution,
+                                                        shared_rel_pos_bias=shared_rel_pos_bias))
+        x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+    return x
+
+
+def beit_forward_features(self, x):
+    """
+    Modification of timm.models.beit.py: Beit.forward_features to support arbitrary window sizes.
+    """
+    resolution = x.shape[2:]
+
+    x = self.patch_embed(x)
+    x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+    if self.pos_embed is not None:
+        x = x + self.pos_embed
+    x = self.pos_drop(x)
+
+    rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+    for blk in self.blocks:
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias)
+        else:
+            x = blk(x, resolution, shared_rel_pos_bias=rel_pos_bias)
+    x = self.norm(x)
+    return x
+
+
+def _make_beit_backbone(
+        model,
+        features=[96, 192, 384, 768],
+        size=[384, 384],
+        hooks=[0, 4, 8, 11],
+        vit_features=768,
+        use_readout="ignore",
+        start_index=1,
+        start_index_readout=1,
+):
+    backbone = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index,
+                                     start_index_readout)
+
+    backbone.model.patch_embed.forward = types.MethodType(patch_embed_forward, backbone.model.patch_embed)
+    backbone.model.forward_features = types.MethodType(beit_forward_features, backbone.model)
+
+    for block in backbone.model.blocks:
+        attn = block.attn
+        attn._get_rel_pos_bias = types.MethodType(_get_rel_pos_bias, attn)
+        attn.forward = types.MethodType(attention_forward, attn)
+        attn.relative_position_indices = {}
+
+        block.forward = types.MethodType(block_forward, block)
+
+    return backbone
+
+
+def _make_pretrained_beitl16_512(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("beit_large_patch16_512", pretrained=pretrained)
+
+    hooks = [5, 11, 17, 23] if hooks is None else hooks
+
+    features = [256, 512, 1024, 1024]
+
+    return _make_beit_backbone(
+        model,
+        features=features,
+        size=[512, 512],
+        hooks=hooks,
+        vit_features=1024,
+        use_readout=use_readout,
+    )
+
+
+def _make_pretrained_beitl16_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("beit_large_patch16_384", pretrained=pretrained)
+
+    hooks = [5, 11, 17, 23] if hooks is None else hooks
+    return _make_beit_backbone(
+        model,
+        features=[256, 512, 1024, 1024],
+        hooks=hooks,
+        vit_features=1024,
+        use_readout=use_readout,
+    )
+
+
+def _make_pretrained_beitb16_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("beit_base_patch16_384", pretrained=pretrained)
+
+    hooks = [2, 5, 8, 11] if hooks is None else hooks
+    return _make_beit_backbone(
+        model,
+        features=[96, 192, 384, 768],
+        hooks=hooks,
+        use_readout=use_readout,
+    )
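Aside (not part of the diff): `_make_beit_backbone` above does not subclass the timm BEiT model; it rebinds `forward`/`forward_features` on already-constructed instances with `types.MethodType`. A minimal toy sketch of that per-instance patching pattern, using a made-up `Greeter` class rather than the timm model:

import types


class Greeter:
    def __init__(self, name):
        self.name = name

    def greet(self):
        return f"hello, {self.name}"


def shouting_greet(self):
    # Replacement bound method: same signature, different behaviour.
    return self.greet_original().upper()


g = Greeter("beit")
g.greet_original = g.greet                     # keep a handle to the old bound method
g.greet = types.MethodType(shouting_greet, g)  # patch only this instance

print(g.greet())              # HELLO, BEIT
print(Greeter("x").greet())   # other instances are untouched: hello, x

The backbone builder applies the same idea per block and per attention module, which is why each patched `Attention` also gets its own `relative_position_indices` cache attached.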
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/levit.py
ADDED
@@ -0,0 +1,106 @@
+import timm
+import torch
+import torch.nn as nn
+import numpy as np
+
+from .utils import activations, get_activation, Transpose
+
+
+def forward_levit(pretrained, x):
+    pretrained.model.forward_features(x)
+
+    layer_1 = pretrained.activations["1"]
+    layer_2 = pretrained.activations["2"]
+    layer_3 = pretrained.activations["3"]
+
+    layer_1 = pretrained.act_postprocess1(layer_1)
+    layer_2 = pretrained.act_postprocess2(layer_2)
+    layer_3 = pretrained.act_postprocess3(layer_3)
+
+    return layer_1, layer_2, layer_3
+
+
+def _make_levit_backbone(
+        model,
+        hooks=[3, 11, 21],
+        patch_grid=[14, 14]
+):
+    pretrained = nn.Module()
+
+    pretrained.model = model
+    pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
+    pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
+    pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
+
+    pretrained.activations = activations
+
+    patch_grid_size = np.array(patch_grid, dtype=int)
+
+    pretrained.act_postprocess1 = nn.Sequential(
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size(patch_grid_size.tolist()))
+    )
+    pretrained.act_postprocess2 = nn.Sequential(
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 2).astype(int)).tolist()))
+    )
+    pretrained.act_postprocess3 = nn.Sequential(
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 4).astype(int)).tolist()))
+    )
+
+    return pretrained
+
+
+class ConvTransposeNorm(nn.Sequential):
+    """
+    Modification of
+        https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: ConvNorm
+    such that ConvTranspose2d is used instead of Conv2d.
+    """
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=1, stride=1, pad=0, dilation=1,
+            groups=1, bn_weight_init=1):
+        super().__init__()
+        self.add_module('c',
+                        nn.ConvTranspose2d(in_chs, out_chs, kernel_size, stride, pad, dilation, groups, bias=False))
+        self.add_module('bn', nn.BatchNorm2d(out_chs))
+
+        nn.init.constant_(self.bn.weight, bn_weight_init)
+
+    @torch.no_grad()
+    def fuse(self):
+        c, bn = self._modules.values()
+        w = bn.weight / (bn.running_var + bn.eps) ** 0.5
+        w = c.weight * w[:, None, None, None]
+        b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5
+        m = nn.ConvTranspose2d(
+            w.size(1), w.size(0), w.shape[2:], stride=self.c.stride,
+            padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups)
+        m.weight.data.copy_(w)
+        m.bias.data.copy_(b)
+        return m
+
+
+def stem_b4_transpose(in_chs, out_chs, activation):
+    """
+    Modification of
+        https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: stem_b16
+    such that ConvTranspose2d is used instead of Conv2d and stem is also reduced to the half.
+    """
+    return nn.Sequential(
+        ConvTransposeNorm(in_chs, out_chs, 3, 2, 1),
+        activation(),
+        ConvTransposeNorm(out_chs, out_chs // 2, 3, 2, 1),
+        activation())
+
+
+def _make_pretrained_levit_384(pretrained, hooks=None):
+    model = timm.create_model("levit_384", pretrained=pretrained)
+
+    hooks = [3, 11, 21] if hooks == None else hooks
+    return _make_levit_backbone(
+        model,
+        hooks=hooks
+    )
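Aside (not part of the diff): like the other MiDaS backbones, `_make_levit_backbone` captures intermediate features through `register_forward_hook` and the shared `activations` dict from `.utils`, instead of rewriting the model's forward pass. A self-contained sketch of that capture pattern, with a toy `nn.Sequential` and a local dict standing in for the one in `.utils`:

import torch
import torch.nn as nn

activations = {}

def get_activation(name):
    # Returns a hook that stores the layer's output under `name`.
    def hook(module, inputs, output):
        activations[name] = output
    return hook

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
model[0].register_forward_hook(get_activation("1"))
model[2].register_forward_hook(get_activation("2"))

_ = model(torch.randn(2, 8))   # the forward pass fills the dict as a side effect
print(activations["1"].shape)  # torch.Size([2, 16])
print(activations["2"].shape)  # torch.Size([2, 4])

In the LeViT case the hooked outputs are token sequences, so the `act_postprocess*` modules transpose and unflatten them back into 2D feature maps at the stored patch-grid resolutions.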
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/next_vit.py
ADDED
@@ -0,0 +1,39 @@
+import timm
+
+import torch.nn as nn
+
+from pathlib import Path
+from .utils import activations, forward_default, get_activation
+
+from ..external.next_vit.classification.nextvit import *
+
+
+def forward_next_vit(pretrained, x):
+    return forward_default(pretrained, x, "forward")
+
+
+def _make_next_vit_backbone(
+        model,
+        hooks=[2, 6, 36, 39],
+):
+    pretrained = nn.Module()
+
+    pretrained.model = model
+    pretrained.model.features[hooks[0]].register_forward_hook(get_activation("1"))
+    pretrained.model.features[hooks[1]].register_forward_hook(get_activation("2"))
+    pretrained.model.features[hooks[2]].register_forward_hook(get_activation("3"))
+    pretrained.model.features[hooks[3]].register_forward_hook(get_activation("4"))
+
+    pretrained.activations = activations
+
+    return pretrained
+
+
+def _make_pretrained_next_vit_large_6m(hooks=None):
+    model = timm.create_model("nextvit_large")
+
+    hooks = [2, 6, 36, 39] if hooks == None else hooks
+    return _make_next_vit_backbone(
+        model,
+        hooks=hooks,
+    )
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin.py
ADDED
@@ -0,0 +1,13 @@
+import timm
+
+from .swin_common import _make_swin_backbone
+
+
+def _make_pretrained_swinl12_384(pretrained, hooks=None):
+    model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained)
+
+    hooks = [1, 1, 17, 1] if hooks == None else hooks
+    return _make_swin_backbone(
+        model,
+        hooks=hooks
+    )
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin2.py
ADDED
@@ -0,0 +1,34 @@
+import timm
+
+from .swin_common import _make_swin_backbone
+
+
+def _make_pretrained_swin2l24_384(pretrained, hooks=None):
+    model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained)
+
+    hooks = [1, 1, 17, 1] if hooks == None else hooks
+    return _make_swin_backbone(
+        model,
+        hooks=hooks
+    )
+
+
+def _make_pretrained_swin2b24_384(pretrained, hooks=None):
+    model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained)
+
+    hooks = [1, 1, 17, 1] if hooks == None else hooks
+    return _make_swin_backbone(
+        model,
+        hooks=hooks
+    )
+
+
+def _make_pretrained_swin2t16_256(pretrained, hooks=None):
+    model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained)
+
+    hooks = [1, 1, 5, 1] if hooks == None else hooks
+    return _make_swin_backbone(
+        model,
+        hooks=hooks,
+        patch_grid=[64, 64]
+    )