toto10 committed
Commit 4d244e3
1 Parent(s): 605f6d5

5c92d4965b9c0b7f421c923eaf90c134e60fd25cc660369d2b79e6de7c3e2f37

Files changed (50)
  1. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py +50 -0
  2. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py +46 -0
  3. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py +44 -0
  4. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py +46 -0
  5. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py +47 -0
  6. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py +48 -0
  7. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fast_scnn.py +57 -0
  8. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_hr18.py +52 -0
  9. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py +45 -0
  10. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py +51 -0
  11. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fpn_r50.py +36 -0
  12. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py +35 -0
  13. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py +46 -0
  14. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py +25 -0
  15. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py +46 -0
  16. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py +68 -0
  17. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py +47 -0
  18. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pointrend_r50.py +56 -0
  19. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py +49 -0
  20. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py +44 -0
  21. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py +50 -0
  22. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/upernet_r50.py +44 -0
  23. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py +43 -0
  24. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py +9 -0
  25. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py +9 -0
  26. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py +9 -0
  27. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py +9 -0
  28. extensions/microsoftexcel-controlnet/annotator/uniformer/inference.py +144 -0
  29. extensions/microsoftexcel-controlnet/annotator/uniformer/mmcv_custom/__init__.py +5 -0
  30. extensions/microsoftexcel-controlnet/annotator/uniformer/mmcv_custom/checkpoint.py +508 -0
  31. extensions/microsoftexcel-controlnet/annotator/uniformer/uniformer.py +426 -0
  32. extensions/microsoftexcel-controlnet/annotator/uniformer/upernet_global_small.py +44 -0
  33. extensions/microsoftexcel-controlnet/annotator/util.py +79 -0
  34. extensions/microsoftexcel-controlnet/annotator/zoe/LICENSE +21 -0
  35. extensions/microsoftexcel-controlnet/annotator/zoe/__init__.py +59 -0
  36. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/__init__.py +24 -0
  37. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/__init__.py +24 -0
  38. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas.py +379 -0
  39. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/.gitignore +110 -0
  40. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/Dockerfile +29 -0
  41. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/LICENSE +21 -0
  42. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/README.md +259 -0
  43. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/environment.yaml +16 -0
  44. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/hubconf.py +435 -0
  45. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/input/.placeholder +0 -0
  46. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/beit.py +196 -0
  47. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/levit.py +106 -0
  48. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/next_vit.py +39 -0
  49. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin.py +13 -0
  50. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin2.py +34 -0
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py ADDED
@@ -0,0 +1,50 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained=None,
+     backbone=dict(
+         type='UNet',
+         in_channels=3,
+         base_channels=64,
+         num_stages=5,
+         strides=(1, 1, 1, 1, 1),
+         enc_num_convs=(2, 2, 2, 2, 2),
+         dec_num_convs=(2, 2, 2, 2),
+         downsamples=(True, True, True, True),
+         enc_dilations=(1, 1, 1, 1, 1),
+         dec_dilations=(1, 1, 1, 1),
+         with_cp=False,
+         conv_cfg=None,
+         norm_cfg=norm_cfg,
+         act_cfg=dict(type='ReLU'),
+         upsample_cfg=dict(type='InterpConv'),
+         norm_eval=False),
+     decode_head=dict(
+         type='ASPPHead',
+         in_channels=64,
+         in_index=4,
+         channels=16,
+         dilations=(1, 12, 24, 36),
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=128,
+         in_index=3,
+         channels=64,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='slide', crop_size=256, stride=170))
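These `_base_` model fragments are plain Python config files evaluated by mmcv. As a quick sanity check they can be loaded directly; a minimal sketch, assuming an mmcv 1.x installation (the same `Config` API used by `inference.py` further down in this commit):

# Minimal sketch: load the _base_ model config added above and inspect it.
# Assumes mmcv 1.x is installed; the path is the file from this commit.
from mmcv import Config

cfg = Config.fromfile(
    'extensions/microsoftexcel-controlnet/annotator/uniformer/configs/'
    '_base_/models/deeplabv3_unet_s5-d16.py')
print(cfg.model.type)              # EncoderDecoder
print(cfg.model.decode_head.type)  # ASPPHead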
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py ADDED
@@ -0,0 +1,46 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='DepthwiseSeparableASPPHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         dilations=(1, 12, 24, 36),
+         c1_in_channels=256,
+         c1_channels=48,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py ADDED
@@ -0,0 +1,44 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='DMHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         filter_sizes=(1, 3, 5, 7),
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py ADDED
@@ -0,0 +1,46 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='DNLHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         dropout_ratio=0.1,
+         reduction=2,
+         use_scale=True,
+         mode='embedded_gaussian',
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py ADDED
@@ -0,0 +1,47 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='EMAHead',
+         in_channels=2048,
+         in_index=3,
+         channels=256,
+         ema_channels=512,
+         num_bases=64,
+         num_stages=3,
+         momentum=0.1,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py ADDED
@@ -0,0 +1,48 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='EncHead',
+         in_channels=[512, 1024, 2048],
+         in_index=(1, 2, 3),
+         channels=512,
+         num_codes=32,
+         use_se_loss=True,
+         add_lateral=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+         loss_se_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fast_scnn.py ADDED
@@ -0,0 +1,57 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)
+ model = dict(
+     type='EncoderDecoder',
+     backbone=dict(
+         type='FastSCNN',
+         downsample_dw_channels=(32, 48),
+         global_in_channels=64,
+         global_block_channels=(64, 96, 128),
+         global_block_strides=(2, 2, 1),
+         global_out_channels=128,
+         higher_in_channels=64,
+         lower_in_channels=128,
+         fusion_out_channels=128,
+         out_indices=(0, 1, 2),
+         norm_cfg=norm_cfg,
+         align_corners=False),
+     decode_head=dict(
+         type='DepthwiseSeparableFCNHead',
+         in_channels=128,
+         channels=128,
+         concat_input=False,
+         num_classes=19,
+         in_index=-1,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+     auxiliary_head=[
+         dict(
+             type='FCNHead',
+             in_channels=128,
+             channels=32,
+             num_convs=1,
+             num_classes=19,
+             in_index=-2,
+             norm_cfg=norm_cfg,
+             concat_input=False,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+         dict(
+             type='FCNHead',
+             in_channels=64,
+             channels=32,
+             num_convs=1,
+             num_classes=19,
+             in_index=-3,
+             norm_cfg=norm_cfg,
+             concat_input=False,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+     ],
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_hr18.py ADDED
@@ -0,0 +1,52 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://msra/hrnetv2_w18',
+     backbone=dict(
+         type='HRNet',
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         extra=dict(
+             stage1=dict(
+                 num_modules=1,
+                 num_branches=1,
+                 block='BOTTLENECK',
+                 num_blocks=(4, ),
+                 num_channels=(64, )),
+             stage2=dict(
+                 num_modules=1,
+                 num_branches=2,
+                 block='BASIC',
+                 num_blocks=(4, 4),
+                 num_channels=(18, 36)),
+             stage3=dict(
+                 num_modules=4,
+                 num_branches=3,
+                 block='BASIC',
+                 num_blocks=(4, 4, 4),
+                 num_channels=(18, 36, 72)),
+             stage4=dict(
+                 num_modules=3,
+                 num_branches=4,
+                 block='BASIC',
+                 num_blocks=(4, 4, 4, 4),
+                 num_channels=(18, 36, 72, 144)))),
+     decode_head=dict(
+         type='FCNHead',
+         in_channels=[18, 36, 72, 144],
+         in_index=(0, 1, 2, 3),
+         channels=sum([18, 36, 72, 144]),
+         input_transform='resize_concat',
+         kernel_size=1,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=-1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py ADDED
@@ -0,0 +1,45 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='FCNHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         num_convs=2,
+         concat_input=True,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py ADDED
@@ -0,0 +1,51 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained=None,
+     backbone=dict(
+         type='UNet',
+         in_channels=3,
+         base_channels=64,
+         num_stages=5,
+         strides=(1, 1, 1, 1, 1),
+         enc_num_convs=(2, 2, 2, 2, 2),
+         dec_num_convs=(2, 2, 2, 2),
+         downsamples=(True, True, True, True),
+         enc_dilations=(1, 1, 1, 1, 1),
+         dec_dilations=(1, 1, 1, 1),
+         with_cp=False,
+         conv_cfg=None,
+         norm_cfg=norm_cfg,
+         act_cfg=dict(type='ReLU'),
+         upsample_cfg=dict(type='InterpConv'),
+         norm_eval=False),
+     decode_head=dict(
+         type='FCNHead',
+         in_channels=64,
+         in_index=4,
+         channels=64,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=128,
+         in_index=3,
+         channels=64,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='slide', crop_size=256, stride=170))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fpn_r50.py ADDED
@@ -0,0 +1,36 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 1, 1),
+         strides=(1, 2, 2, 2),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     neck=dict(
+         type='FPN',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         num_outs=4),
+     decode_head=dict(
+         type='FPNHead',
+         in_channels=[256, 256, 256, 256],
+         in_index=[0, 1, 2, 3],
+         feature_strides=[4, 8, 16, 32],
+         channels=128,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py ADDED
@@ -0,0 +1,35 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     backbone=dict(
+         type='UniFormer',
+         embed_dim=[64, 128, 320, 512],
+         layers=[3, 4, 8, 3],
+         head_dim=64,
+         mlp_ratio=4.,
+         qkv_bias=True,
+         drop_rate=0.,
+         attn_drop_rate=0.,
+         drop_path_rate=0.1),
+     neck=dict(
+         type='FPN',
+         in_channels=[64, 128, 320, 512],
+         out_channels=256,
+         num_outs=4),
+     decode_head=dict(
+         type='FPNHead',
+         in_channels=[256, 256, 256, 256],
+         in_index=[0, 1, 2, 3],
+         feature_strides=[4, 8, 16, 32],
+         channels=128,
+         dropout_ratio=0.1,
+         num_classes=150,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole')
+     )
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py ADDED
@@ -0,0 +1,46 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='GCHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         ratio=1 / 4.,
+         pooling_type='att',
+         fusion_types=('channel_add', ),
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py ADDED
@@ -0,0 +1,25 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     backbone=dict(
+         type='MobileNetV3',
+         arch='large',
+         out_indices=(1, 3, 16),
+         norm_cfg=norm_cfg),
+     decode_head=dict(
+         type='LRASPPHead',
+         in_channels=(16, 24, 960),
+         in_index=(0, 1, 2),
+         channels=128,
+         input_transform='multiple_select',
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         act_cfg=dict(type='ReLU'),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py ADDED
@@ -0,0 +1,46 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='NLHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         dropout_ratio=0.1,
+         reduction=2,
+         use_scale=True,
+         mode='embedded_gaussian',
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py ADDED
@@ -0,0 +1,68 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='CascadeEncoderDecoder',
+     num_stages=2,
+     pretrained='open-mmlab://msra/hrnetv2_w18',
+     backbone=dict(
+         type='HRNet',
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         extra=dict(
+             stage1=dict(
+                 num_modules=1,
+                 num_branches=1,
+                 block='BOTTLENECK',
+                 num_blocks=(4, ),
+                 num_channels=(64, )),
+             stage2=dict(
+                 num_modules=1,
+                 num_branches=2,
+                 block='BASIC',
+                 num_blocks=(4, 4),
+                 num_channels=(18, 36)),
+             stage3=dict(
+                 num_modules=4,
+                 num_branches=3,
+                 block='BASIC',
+                 num_blocks=(4, 4, 4),
+                 num_channels=(18, 36, 72)),
+             stage4=dict(
+                 num_modules=3,
+                 num_branches=4,
+                 block='BASIC',
+                 num_blocks=(4, 4, 4, 4),
+                 num_channels=(18, 36, 72, 144)))),
+     decode_head=[
+         dict(
+             type='FCNHead',
+             in_channels=[18, 36, 72, 144],
+             channels=sum([18, 36, 72, 144]),
+             in_index=(0, 1, 2, 3),
+             input_transform='resize_concat',
+             kernel_size=1,
+             num_convs=1,
+             concat_input=False,
+             dropout_ratio=-1,
+             num_classes=19,
+             norm_cfg=norm_cfg,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+         dict(
+             type='OCRHead',
+             in_channels=[18, 36, 72, 144],
+             in_index=(0, 1, 2, 3),
+             input_transform='resize_concat',
+             channels=512,
+             ocr_channels=256,
+             dropout_ratio=-1,
+             num_classes=19,
+             norm_cfg=norm_cfg,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     ],
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py ADDED
@@ -0,0 +1,47 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='CascadeEncoderDecoder',
+     num_stages=2,
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=[
+         dict(
+             type='FCNHead',
+             in_channels=1024,
+             in_index=2,
+             channels=256,
+             num_convs=1,
+             concat_input=False,
+             dropout_ratio=0.1,
+             num_classes=19,
+             norm_cfg=norm_cfg,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+         dict(
+             type='OCRHead',
+             in_channels=2048,
+             in_index=3,
+             channels=512,
+             ocr_channels=256,
+             dropout_ratio=0.1,
+             num_classes=19,
+             norm_cfg=norm_cfg,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+     ],
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pointrend_r50.py ADDED
@@ -0,0 +1,56 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='CascadeEncoderDecoder',
+     num_stages=2,
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 1, 1),
+         strides=(1, 2, 2, 2),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     neck=dict(
+         type='FPN',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         num_outs=4),
+     decode_head=[
+         dict(
+             type='FPNHead',
+             in_channels=[256, 256, 256, 256],
+             in_index=[0, 1, 2, 3],
+             feature_strides=[4, 8, 16, 32],
+             channels=128,
+             dropout_ratio=-1,
+             num_classes=19,
+             norm_cfg=norm_cfg,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+         dict(
+             type='PointHead',
+             in_channels=[256],
+             in_index=[0],
+             channels=256,
+             num_fcs=3,
+             coarse_pred_each_layer=True,
+             dropout_ratio=-1,
+             num_classes=19,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+     ],
+     # model training and testing settings
+     train_cfg=dict(
+         num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75),
+     test_cfg=dict(
+         mode='whole',
+         subdivision_steps=2,
+         subdivision_num_points=8196,
+         scale_factor=2))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py ADDED
@@ -0,0 +1,49 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='PSAHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         mask_size=(97, 97),
+         psa_type='bi-direction',
+         compact=False,
+         shrink_factor=2,
+         normalization_factor=1.0,
+         psa_softmax=True,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py ADDED
@@ -0,0 +1,44 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='PSPHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         pool_scales=(1, 2, 3, 6),
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py ADDED
@@ -0,0 +1,50 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained=None,
+     backbone=dict(
+         type='UNet',
+         in_channels=3,
+         base_channels=64,
+         num_stages=5,
+         strides=(1, 1, 1, 1, 1),
+         enc_num_convs=(2, 2, 2, 2, 2),
+         dec_num_convs=(2, 2, 2, 2),
+         downsamples=(True, True, True, True),
+         enc_dilations=(1, 1, 1, 1, 1),
+         dec_dilations=(1, 1, 1, 1),
+         with_cp=False,
+         conv_cfg=None,
+         norm_cfg=norm_cfg,
+         act_cfg=dict(type='ReLU'),
+         upsample_cfg=dict(type='InterpConv'),
+         norm_eval=False),
+     decode_head=dict(
+         type='PSPHead',
+         in_channels=64,
+         in_index=4,
+         channels=16,
+         pool_scales=(1, 2, 3, 6),
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=128,
+         in_index=3,
+         channels=64,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='slide', crop_size=256, stride=170))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/upernet_r50.py ADDED
@@ -0,0 +1,44 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 1, 1),
+         strides=(1, 2, 2, 2),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='UPerHead',
+         in_channels=[256, 512, 1024, 2048],
+         in_index=[0, 1, 2, 3],
+         pool_scales=(1, 2, 3, 6),
+         channels=512,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py ADDED
@@ -0,0 +1,43 @@
+ # model settings
+ norm_cfg = dict(type='BN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained=None,
+     backbone=dict(
+         type='UniFormer',
+         embed_dim=[64, 128, 320, 512],
+         layers=[3, 4, 8, 3],
+         head_dim=64,
+         mlp_ratio=4.,
+         qkv_bias=True,
+         drop_rate=0.,
+         attn_drop_rate=0.,
+         drop_path_rate=0.1),
+     decode_head=dict(
+         type='UPerHead',
+         in_channels=[64, 128, 320, 512],
+         in_index=[0, 1, 2, 3],
+         pool_scales=(1, 2, 3, 6),
+         channels=512,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=320,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py ADDED
@@ -0,0 +1,9 @@
+ # optimizer
+ optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+ optimizer_config = dict()
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=160000)
+ checkpoint_config = dict(by_epoch=False, interval=16000)
+ evaluation = dict(interval=16000, metric='mIoU')
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py ADDED
@@ -0,0 +1,9 @@
+ # optimizer
+ optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+ optimizer_config = dict()
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=20000)
+ checkpoint_config = dict(by_epoch=False, interval=2000)
+ evaluation = dict(interval=2000, metric='mIoU')
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py ADDED
@@ -0,0 +1,9 @@
+ # optimizer
+ optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+ optimizer_config = dict()
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=40000)
+ checkpoint_config = dict(by_epoch=False, interval=4000)
+ evaluation = dict(interval=4000, metric='mIoU')
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py ADDED
@@ -0,0 +1,9 @@
+ # optimizer
+ optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+ optimizer_config = dict()
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=80000)
+ checkpoint_config = dict(by_epoch=False, interval=8000)
+ evaluation = dict(interval=8000, metric='mIoU')
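These schedule fragments are meant to be pulled into a full training config through mmseg-style `_base_` inheritance rather than run on their own. A hypothetical top-level config (the relative paths and the override are illustrative assumptions) could look like:

# Hypothetical top-level config composing the _base_ fragments added in this
# commit (mmsegmentation-style inheritance; relative paths are assumed).
_base_ = [
    '../_base_/models/upernet_uniformer.py',
    '../_base_/schedules/schedule_80k.py',
]
# Inherited fields can still be overridden here, e.g. the number of classes:
model = dict(decode_head=dict(num_classes=150))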
extensions/microsoftexcel-controlnet/annotator/uniformer/inference.py ADDED
@@ -0,0 +1,144 @@
+
+ import torch
+
+ try:
+     import mmcv as mmcv
+     from mmcv.parallel import collate, scatter
+     from mmcv.runner import load_checkpoint
+     from mmseg.datasets.pipelines import Compose
+     from mmseg.models import build_segmentor
+ except ImportError:
+     import annotator.mmpkg.mmcv as mmcv
+     from annotator.mmpkg.mmcv.parallel import collate, scatter
+     from annotator.mmpkg.mmcv.runner import load_checkpoint
+     from annotator.mmpkg.mmseg.datasets.pipelines import Compose
+     from annotator.mmpkg.mmseg.models import build_segmentor
+
+ def init_segmentor(config, checkpoint=None, device='cuda:0'):
+     """Initialize a segmentor from config file.
+
+     Args:
+         config (str or :obj:`mmcv.Config`): Config file path or the config
+             object.
+         checkpoint (str, optional): Checkpoint path. If left as None, the model
+             will not load any weights.
+         device (str, optional) CPU/CUDA device option. Default 'cuda:0'.
+             Use 'cpu' for loading model on CPU.
+     Returns:
+         nn.Module: The constructed segmentor.
+     """
+     if isinstance(config, str):
+         config = mmcv.Config.fromfile(config)
+     elif not isinstance(config, mmcv.Config):
+         raise TypeError('config must be a filename or Config object, '
+                         'but got {}'.format(type(config)))
+     config.model.pretrained = None
+     config.model.train_cfg = None
+     model = build_segmentor(config.model, test_cfg=config.get('test_cfg'))
+     if checkpoint is not None:
+         checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+         model.CLASSES = checkpoint['meta']['CLASSES']
+         model.PALETTE = checkpoint['meta']['PALETTE']
+     model.cfg = config  # save the config in the model for convenience
+     model.to(device)
+     model.eval()
+     return model
+
+
+ class LoadImage:
+     """A simple pipeline to load image."""
+
+     def __call__(self, results):
+         """Call function to load images into results.
+
+         Args:
+             results (dict): A result dict contains the file name
+                 of the image to be read.
+
+         Returns:
+             dict: ``results`` will be returned containing loaded image.
+         """
+
+         if isinstance(results['img'], str):
+             results['filename'] = results['img']
+             results['ori_filename'] = results['img']
+         else:
+             results['filename'] = None
+             results['ori_filename'] = None
+         img = mmcv.imread(results['img'])
+         results['img'] = img
+         results['img_shape'] = img.shape
+         results['ori_shape'] = img.shape
+         return results
+
+
+ def inference_segmentor(model, img):
+     """Inference image(s) with the segmentor.
+
+     Args:
+         model (nn.Module): The loaded segmentor.
+         imgs (str/ndarray or list[str/ndarray]): Either image files or loaded
+             images.
+
+     Returns:
+         (list[Tensor]): The segmentation result.
+     """
+     cfg = model.cfg
+     device = next(model.parameters()).device  # model device
+     # build the data pipeline
+     test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:]
+     test_pipeline = Compose(test_pipeline)
+     # prepare data
+     data = dict(img=img)
+     data = test_pipeline(data)
+     data = collate([data], samples_per_gpu=1)
+     if next(model.parameters()).is_cuda:
+         # scatter to specified GPU
+         data = scatter(data, [device])[0]
+     else:
+         data['img_metas'] = [i.data[0] for i in data['img_metas']]
+
+     data['img'] = [x.to(device) for x in data['img']]
+
+     # forward the model
+     with torch.no_grad():
+         result = model(return_loss=False, rescale=True, **data)
+     return result
+
+
+ def show_result_pyplot(model,
+                        img,
+                        result,
+                        palette=None,
+                        fig_size=(15, 10),
+                        opacity=0.5,
+                        title='',
+                        block=True):
+     """Visualize the segmentation results on the image.
+
+     Args:
+         model (nn.Module): The loaded segmentor.
+         img (str or np.ndarray): Image filename or loaded image.
+         result (list): The segmentation result.
+         palette (list[list[int]]] | None): The palette of segmentation
+             map. If None is given, random palette will be generated.
+             Default: None
+         fig_size (tuple): Figure size of the pyplot figure.
+         opacity(float): Opacity of painted segmentation map.
+             Default 0.5.
+             Must be in (0, 1] range.
+         title (str): The title of pyplot figure.
+             Default is ''.
+         block (bool): Whether to block the pyplot figure.
+             Default is True.
+     """
+     if hasattr(model, 'module'):
+         model = model.module
+     img = model.show_result(
+         img, result, palette=palette, show=False, opacity=opacity)
+     # plt.figure(figsize=fig_size)
+     # plt.imshow(mmcv.bgr2rgb(img))
+     # plt.title(title)
+     # plt.tight_layout()
+     # plt.show(block=block)
+     return mmcv.bgr2rgb(img)
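The three helpers above are typically chained together; a rough usage sketch (the import path assumes this extension's package layout, and the config/checkpoint/image paths are placeholders):

# Rough usage sketch for the helpers defined above; paths are placeholders
# and the import path assumes this extension's package layout.
from annotator.uniformer.inference import (init_segmentor, inference_segmentor,
                                           show_result_pyplot)

model = init_segmentor('upernet_global_small.py', 'upernet_global_small.pth',
                       device='cuda:0')
result = inference_segmentor(model, 'demo.png')      # list with one H x W label map
vis = show_result_pyplot(model, 'demo.png', result)  # RGB ndarray with the overlay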
extensions/microsoftexcel-controlnet/annotator/uniformer/mmcv_custom/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # -*- coding: utf-8 -*-
+
+ from .checkpoint import load_checkpoint
+
+ __all__ = ['load_checkpoint']
extensions/microsoftexcel-controlnet/annotator/uniformer/mmcv_custom/checkpoint.py ADDED
@@ -0,0 +1,508 @@
+ # Copyright (c) Open-MMLab. All rights reserved.
+ import io
+ import os
+ import os.path as osp
+ import pkgutil
+ import time
+ import warnings
+ from collections import OrderedDict
+ from importlib import import_module
+ from tempfile import TemporaryDirectory
+
+ import torch
+ import torchvision
+ from torch.optim import Optimizer
+ from torch.utils import model_zoo
+ from torch.nn import functional as F
+
+ try:
+     import mmcv as mmcv
+     from mmcv.fileio import FileClient
+     from mmcv.fileio import load as load_file
+     from mmcv.parallel import is_module_wrapper
+     from mmcv.utils import mkdir_or_exist
+     from mmcv.runner import get_dist_info
+ except ImportError:
+     import annotator.mmpkg.mmcv as mmcv
+     from annotator.mmpkg.mmcv.fileio import FileClient
+     from annotator.mmpkg.mmcv.fileio import load as load_file
+     from annotator.mmpkg.mmcv.parallel import is_module_wrapper
+     from annotator.mmpkg.mmcv.utils import mkdir_or_exist
+     from annotator.mmpkg.mmcv.runner import get_dist_info
+
+ ENV_MMCV_HOME = 'MMCV_HOME'
+ ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
+ DEFAULT_CACHE_DIR = '~/.cache'
+
+
+ def _get_mmcv_home():
+     mmcv_home = os.path.expanduser(
+         os.getenv(
+             ENV_MMCV_HOME,
+             os.path.join(
+                 os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv')))
+
+     mkdir_or_exist(mmcv_home)
+     return mmcv_home
+
+
+ def load_state_dict(module, state_dict, strict=False, logger=None):
+     """Load state_dict to a module.
+
+     This method is modified from :meth:`torch.nn.Module.load_state_dict`.
+     Default value for ``strict`` is set to ``False`` and the message for
+     param mismatch will be shown even if strict is False.
+
+     Args:
+         module (Module): Module that receives the state_dict.
+         state_dict (OrderedDict): Weights.
+         strict (bool): whether to strictly enforce that the keys
+             in :attr:`state_dict` match the keys returned by this module's
+             :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
+         logger (:obj:`logging.Logger`, optional): Logger to log the error
+             message. If not specified, print function will be used.
+     """
+     unexpected_keys = []
+     all_missing_keys = []
+     err_msg = []
+
+     metadata = getattr(state_dict, '_metadata', None)
+     state_dict = state_dict.copy()
+     if metadata is not None:
+         state_dict._metadata = metadata
+
+     # use _load_from_state_dict to enable checkpoint version control
+     def load(module, prefix=''):
+         # recursively check parallel module in case that the model has a
+         # complicated structure, e.g., nn.Module(nn.Module(DDP))
+         if is_module_wrapper(module):
+             module = module.module
+         local_metadata = {} if metadata is None else metadata.get(
+             prefix[:-1], {})
+         module._load_from_state_dict(state_dict, prefix, local_metadata, True,
+                                      all_missing_keys, unexpected_keys,
+                                      err_msg)
+         for name, child in module._modules.items():
+             if child is not None:
+                 load(child, prefix + name + '.')
+
+     load(module)
+     load = None  # break load->load reference cycle
+
+     # ignore "num_batches_tracked" of BN layers
+     missing_keys = [
+         key for key in all_missing_keys if 'num_batches_tracked' not in key
+     ]
+
+     if unexpected_keys:
+         err_msg.append('unexpected key in source '
+                        f'state_dict: {", ".join(unexpected_keys)}\n')
+     if missing_keys:
+         err_msg.append(
+             f'missing keys in source state_dict: {", ".join(missing_keys)}\n')
+
+     rank, _ = get_dist_info()
+     if len(err_msg) > 0 and rank == 0:
+         err_msg.insert(
+             0, 'The model and loaded state dict do not match exactly\n')
+         err_msg = '\n'.join(err_msg)
+         if strict:
+             raise RuntimeError(err_msg)
+         elif logger is not None:
+             logger.warning(err_msg)
+         else:
+             print(err_msg)
+
+
+ def load_url_dist(url, model_dir=None):
+     """In distributed setting, this function only download checkpoint at local
+     rank 0."""
+     rank, world_size = get_dist_info()
+     rank = int(os.environ.get('LOCAL_RANK', rank))
+     if rank == 0:
+         checkpoint = model_zoo.load_url(url, model_dir=model_dir)
+     if world_size > 1:
+         torch.distributed.barrier()
+         if rank > 0:
+             checkpoint = model_zoo.load_url(url, model_dir=model_dir)
+     return checkpoint
+
+
+ def load_pavimodel_dist(model_path, map_location=None):
+     """In distributed setting, this function only download checkpoint at local
+     rank 0."""
+     try:
+         from pavi import modelcloud
+     except ImportError:
+         raise ImportError(
+             'Please install pavi to load checkpoint from modelcloud.')
+     rank, world_size = get_dist_info()
+     rank = int(os.environ.get('LOCAL_RANK', rank))
+     if rank == 0:
+         model = modelcloud.get(model_path)
+         with TemporaryDirectory() as tmp_dir:
+             downloaded_file = osp.join(tmp_dir, model.name)
+             model.download(downloaded_file)
+             checkpoint = torch.load(downloaded_file, map_location=map_location)
+     if world_size > 1:
+         torch.distributed.barrier()
+         if rank > 0:
+             model = modelcloud.get(model_path)
+             with TemporaryDirectory() as tmp_dir:
+                 downloaded_file = osp.join(tmp_dir, model.name)
+                 model.download(downloaded_file)
+                 checkpoint = torch.load(
+                     downloaded_file, map_location=map_location)
+     return checkpoint
+
+
+ def load_fileclient_dist(filename, backend, map_location):
+     """In distributed setting, this function only download checkpoint at local
+     rank 0."""
+     rank, world_size = get_dist_info()
+     rank = int(os.environ.get('LOCAL_RANK', rank))
+     allowed_backends = ['ceph']
+     if backend not in allowed_backends:
+         raise ValueError(f'Load from Backend {backend} is not supported.')
+     if rank == 0:
+         fileclient = FileClient(backend=backend)
+         buffer = io.BytesIO(fileclient.get(filename))
+         checkpoint = torch.load(buffer, map_location=map_location)
+     if world_size > 1:
+         torch.distributed.barrier()
+         if rank > 0:
+             fileclient = FileClient(backend=backend)
+             buffer = io.BytesIO(fileclient.get(filename))
+             checkpoint = torch.load(buffer, map_location=map_location)
+     return checkpoint
+
+
+ def get_torchvision_models():
+     model_urls = dict()
+     for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
+         if ispkg:
+             continue
+         _zoo = import_module(f'torchvision.models.{name}')
+         if hasattr(_zoo, 'model_urls'):
+             _urls = getattr(_zoo, 'model_urls')
+             model_urls.update(_urls)
+     return model_urls
+
+
+ def get_external_models():
+     mmcv_home = _get_mmcv_home()
+     default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
+     default_urls = load_file(default_json_path)
+     assert isinstance(default_urls, dict)
+     external_json_path = osp.join(mmcv_home, 'open_mmlab.json')
+     if osp.exists(external_json_path):
+         external_urls = load_file(external_json_path)
+         assert isinstance(external_urls, dict)
+         default_urls.update(external_urls)
+
+     return default_urls
+
+
+ def get_mmcls_models():
+     mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
+     mmcls_urls = load_file(mmcls_json_path)
+
+     return mmcls_urls
+
+
+ def get_deprecated_model_names():
+     deprecate_json_path = osp.join(mmcv.__path__[0],
+                                    'model_zoo/deprecated.json')
+     deprecate_urls = load_file(deprecate_json_path)
+     assert isinstance(deprecate_urls, dict)
+
+     return deprecate_urls
+
+
+ def _process_mmcls_checkpoint(checkpoint):
+     state_dict = checkpoint['state_dict']
+     new_state_dict = OrderedDict()
+     for k, v in state_dict.items():
+         if k.startswith('backbone.'):
+             new_state_dict[k[9:]] = v
+     new_checkpoint = dict(state_dict=new_state_dict)
+
+     return new_checkpoint
+
+
+ def _load_checkpoint(filename, map_location=None):
+     """Load checkpoint from somewhere (modelzoo, file, url).
+
+     Args:
+         filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+             ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+             details.
+         map_location (str | None): Same as :func:`torch.load`. Default: None.
+
+     Returns:
+         dict | OrderedDict: The loaded checkpoint. It can be either an
+             OrderedDict storing model weights or a dict containing other
+             information, which depends on the checkpoint.
+     """
+     if filename.startswith('modelzoo://'):
+         warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
+                       'use "torchvision://" instead')
+         model_urls = get_torchvision_models()
+         model_name = filename[11:]
+         checkpoint = load_url_dist(model_urls[model_name])
+     elif filename.startswith('torchvision://'):
+         model_urls = get_torchvision_models()
+         model_name = filename[14:]
+         checkpoint = load_url_dist(model_urls[model_name])
+     elif filename.startswith('open-mmlab://'):
+         model_urls = get_external_models()
+         model_name = filename[13:]
+         deprecated_urls = get_deprecated_model_names()
+         if model_name in deprecated_urls:
+             warnings.warn(f'open-mmlab://{model_name} is deprecated in favor '
+                           f'of open-mmlab://{deprecated_urls[model_name]}')
+             model_name = deprecated_urls[model_name]
+         model_url = model_urls[model_name]
+         # check if is url
+         if model_url.startswith(('http://', 'https://')):
+             checkpoint = load_url_dist(model_url)
+         else:
+             filename = osp.join(_get_mmcv_home(), model_url)
+             if not osp.isfile(filename):
+                 raise IOError(f'{filename} is not a checkpoint file')
+             checkpoint = torch.load(filename, map_location=map_location)
+     elif filename.startswith('mmcls://'):
+         model_urls = get_mmcls_models()
+         model_name = filename[8:]
+         checkpoint = load_url_dist(model_urls[model_name])
+         checkpoint = _process_mmcls_checkpoint(checkpoint)
+     elif filename.startswith(('http://', 'https://')):
+         checkpoint = load_url_dist(filename)
+     elif filename.startswith('pavi://'):
+         model_path = filename[7:]
+         checkpoint = load_pavimodel_dist(model_path, map_location=map_location)
+     elif filename.startswith('s3://'):
+         checkpoint = load_fileclient_dist(
+             filename, backend='ceph', map_location=map_location)
+     else:
+         if not osp.isfile(filename):
+             raise IOError(f'{filename} is not a checkpoint file')
+         checkpoint = torch.load(filename, map_location=map_location)
+     return checkpoint
+
+
+ def load_checkpoint(model,
+                     filename,
+                     map_location='cpu',
+                     strict=False,
+                     logger=None):
+     """Load checkpoint from a file or URI.
+
+     Args:
+         model (Module): Module to load checkpoint.
+         filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+             ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+             details.
+         map_location (str): Same as :func:`torch.load`.
+         strict (bool): Whether to allow different params for the model and
+             checkpoint.
+         logger (:mod:`logging.Logger` or None): The logger for error message.
+
+     Returns:
+         dict or OrderedDict: The loaded checkpoint.
+     """
+     checkpoint = _load_checkpoint(filename, map_location)
+     # OrderedDict is a subclass of dict
+     if not isinstance(checkpoint, dict):
+         raise RuntimeError(
+             f'No state_dict found in checkpoint file {filename}')
+     # get state_dict from checkpoint
+     if 'state_dict' in checkpoint:
+         state_dict = checkpoint['state_dict']
+     elif 'model' in checkpoint:
+         state_dict = checkpoint['model']
+     else:
+         state_dict = checkpoint
+     # strip prefix of state_dict
+     if list(state_dict.keys())[0].startswith('module.'):
+         state_dict = {k[7:]: v for k, v in state_dict.items()}
+
+     # for MoBY, load model of online branch
+     if sorted(list(state_dict.keys()))[0].startswith('encoder'):
+         state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')}
+
+     # reshape absolute position embedding
+     if state_dict.get('absolute_pos_embed') is not None:
336
+ absolute_pos_embed = state_dict['absolute_pos_embed']
337
+ N1, L, C1 = absolute_pos_embed.size()
338
+ N2, C2, H, W = model.absolute_pos_embed.size()
339
+ if N1 != N2 or C1 != C2 or L != H*W:
340
+ logger.warning("Error in loading absolute_pos_embed, pass")
341
+ else:
342
+ state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2)
343
+
344
+ # interpolate position bias table if needed
345
+ relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k]
346
+ for table_key in relative_position_bias_table_keys:
347
+ table_pretrained = state_dict[table_key]
348
+ table_current = model.state_dict()[table_key]
349
+ L1, nH1 = table_pretrained.size()
350
+ L2, nH2 = table_current.size()
351
+ if nH1 != nH2:
352
+ logger.warning(f"Error in loading {table_key}, pass")
353
+ else:
354
+ if L1 != L2:
355
+ S1 = int(L1 ** 0.5)
356
+ S2 = int(L2 ** 0.5)
357
+ table_pretrained_resized = F.interpolate(
358
+ table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
359
+ size=(S2, S2), mode='bicubic')
360
+ state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0)
361
+
362
+ # load state_dict
363
+ load_state_dict(model, state_dict, strict, logger)
364
+ return checkpoint
365
+
366
+
367
+ def weights_to_cpu(state_dict):
368
+ """Copy a model state_dict to cpu.
369
+
370
+ Args:
371
+ state_dict (OrderedDict): Model weights on GPU.
372
+
373
+ Returns:
374
+ OrderedDict: Model weights on CPU.
375
+ """
376
+ state_dict_cpu = OrderedDict()
377
+ for key, val in state_dict.items():
378
+ state_dict_cpu[key] = val.cpu()
379
+ return state_dict_cpu
380
+
381
+
382
+ def _save_to_state_dict(module, destination, prefix, keep_vars):
383
+ """Saves module state to `destination` dictionary.
384
+
385
+ This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.
386
+
387
+ Args:
388
+ module (nn.Module): The module to generate state_dict.
389
+ destination (dict): A dict where state will be stored.
390
+ prefix (str): The prefix for parameters and buffers used in this
391
+ module.
392
+ """
393
+ for name, param in module._parameters.items():
394
+ if param is not None:
395
+ destination[prefix + name] = param if keep_vars else param.detach()
396
+ for name, buf in module._buffers.items():
397
+ # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
398
+ if buf is not None:
399
+ destination[prefix + name] = buf if keep_vars else buf.detach()
400
+
401
+
402
+ def get_state_dict(module, destination=None, prefix='', keep_vars=False):
403
+ """Returns a dictionary containing a whole state of the module.
404
+
405
+ Both parameters and persistent buffers (e.g. running averages) are
406
+ included. Keys are corresponding parameter and buffer names.
407
+
408
+ This method is modified from :meth:`torch.nn.Module.state_dict` to
409
+ recursively check parallel module in case that the model has a complicated
410
+ structure, e.g., nn.Module(nn.Module(DDP)).
411
+
412
+ Args:
413
+ module (nn.Module): The module to generate state_dict.
414
+ destination (OrderedDict): Returned dict for the state of the
415
+ module.
416
+ prefix (str): Prefix of the key.
417
+ keep_vars (bool): Whether to keep the variable property of the
418
+ parameters. Default: False.
419
+
420
+ Returns:
421
+ dict: A dictionary containing a whole state of the module.
422
+ """
423
+ # recursively check parallel module in case that the model has a
424
+ # complicated structure, e.g., nn.Module(nn.Module(DDP))
425
+ if is_module_wrapper(module):
426
+ module = module.module
427
+
428
+ # below is the same as torch.nn.Module.state_dict()
429
+ if destination is None:
430
+ destination = OrderedDict()
431
+ destination._metadata = OrderedDict()
432
+ destination._metadata[prefix[:-1]] = local_metadata = dict(
433
+ version=module._version)
434
+ _save_to_state_dict(module, destination, prefix, keep_vars)
435
+ for name, child in module._modules.items():
436
+ if child is not None:
437
+ get_state_dict(
438
+ child, destination, prefix + name + '.', keep_vars=keep_vars)
439
+ for hook in module._state_dict_hooks.values():
440
+ hook_result = hook(module, destination, prefix, local_metadata)
441
+ if hook_result is not None:
442
+ destination = hook_result
443
+ return destination
444
+
445
+
446
+ def save_checkpoint(model, filename, optimizer=None, meta=None):
447
+ """Save checkpoint to file.
448
+
449
+ The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
450
+ ``optimizer``. By default ``meta`` will contain version and time info.
451
+
452
+ Args:
453
+ model (Module): Module whose params are to be saved.
454
+ filename (str): Checkpoint filename.
455
+ optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
456
+ meta (dict, optional): Metadata to be saved in checkpoint.
457
+ """
458
+ if meta is None:
459
+ meta = {}
460
+ elif not isinstance(meta, dict):
461
+ raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
462
+ meta.update(mmcv_version=mmcv.__version__, time=time.asctime())
463
+
464
+ if is_module_wrapper(model):
465
+ model = model.module
466
+
467
+ if hasattr(model, 'CLASSES') and model.CLASSES is not None:
468
+ # save class name to the meta
469
+ meta.update(CLASSES=model.CLASSES)
470
+
471
+ checkpoint = {
472
+ 'meta': meta,
473
+ 'state_dict': weights_to_cpu(get_state_dict(model))
474
+ }
475
+ # save optimizer state dict in the checkpoint
476
+ if isinstance(optimizer, Optimizer):
477
+ checkpoint['optimizer'] = optimizer.state_dict()
478
+ elif isinstance(optimizer, dict):
479
+ checkpoint['optimizer'] = {}
480
+ for name, optim in optimizer.items():
481
+ checkpoint['optimizer'][name] = optim.state_dict()
482
+
483
+ if filename.startswith('pavi://'):
484
+ try:
485
+ from pavi import modelcloud
486
+ from pavi.exception import NodeNotFoundError
487
+ except ImportError:
488
+ raise ImportError(
489
+ 'Please install pavi to load checkpoint from modelcloud.')
490
+ model_path = filename[7:]
491
+ root = modelcloud.Folder()
492
+ model_dir, model_name = osp.split(model_path)
493
+ try:
494
+ model = modelcloud.get(model_dir)
495
+ except NodeNotFoundError:
496
+ model = root.create_training_model(model_dir)
497
+ with TemporaryDirectory() as tmp_dir:
498
+ checkpoint_file = osp.join(tmp_dir, model_name)
499
+ with open(checkpoint_file, 'wb') as f:
500
+ torch.save(checkpoint, f)
501
+ f.flush()
502
+ model.create_file(checkpoint_file, name=model_name)
503
+ else:
504
+ mmcv.mkdir_or_exist(osp.dirname(filename))
505
+ # immediately flush buffer
506
+ with open(filename, 'wb') as f:
507
+ torch.save(checkpoint, f)
508
+ f.flush()
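
A brief usage sketch for the two public entry points defined above, `load_checkpoint` and `save_checkpoint`. The toy model, optimizer, and `/tmp` path are illustrative assumptions rather than part of the commit; the import path assumes the extension root is on `sys.path` (as it is when the webui loads this annotator) and that mmcv is installed.

import torch
import torch.nn as nn

from annotator.uniformer.mmcv_custom.checkpoint import load_checkpoint, save_checkpoint

# Toy stand-in for a real backbone.
model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# save_checkpoint writes a dict with 'meta', 'state_dict' and (optionally) 'optimizer'.
save_checkpoint(model, '/tmp/toy_ckpt.pth', optimizer=optimizer, meta=dict(epoch=1))

# load_checkpoint resolves the path/URI, strips 'module.'/'encoder.' prefixes and
# resizes position embeddings where needed before calling load_state_dict;
# strict=False tolerates key mismatches, which is how ImageNet weights are loaded
# into the UniFormer segmentation backbone.
ckpt = load_checkpoint(model, '/tmp/toy_ckpt.pth', map_location='cpu', strict=False)
print(sorted(ckpt.keys()))  # ['meta', 'optimizer', 'state_dict']
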
extensions/microsoftexcel-controlnet/annotator/uniformer/uniformer.py ADDED
@@ -0,0 +1,426 @@
1
+ # --------------------------------------------------------
2
+ # UniFormer
3
+ # Copyright (c) 2022 SenseTime X-Lab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Kunchang Li
6
+ # --------------------------------------------------------
7
+
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint as checkpoint
13
+
14
+ from functools import partial
15
+ from collections import OrderedDict
16
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
17
+
18
+ try:
19
+ from mmseg.utils import get_root_logger
20
+ from mmseg.models.builder import BACKBONES
21
+ except ImportError:
22
+ from annotator.mmpkg.mmseg.utils import get_root_logger
23
+ from annotator.mmpkg.mmseg.models.builder import BACKBONES
24
+
25
+ from annotator.uniformer.mmcv_custom import load_checkpoint
26
+
27
+
28
+ class Mlp(nn.Module):
29
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
30
+ super().__init__()
31
+ out_features = out_features or in_features
32
+ hidden_features = hidden_features or in_features
33
+ self.fc1 = nn.Linear(in_features, hidden_features)
34
+ self.act = act_layer()
35
+ self.fc2 = nn.Linear(hidden_features, out_features)
36
+ self.drop = nn.Dropout(drop)
37
+
38
+ def forward(self, x):
39
+ x = self.fc1(x)
40
+ x = self.act(x)
41
+ x = self.drop(x)
42
+ x = self.fc2(x)
43
+ x = self.drop(x)
44
+ return x
45
+
46
+
47
+ class CMlp(nn.Module):
48
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
49
+ super().__init__()
50
+ out_features = out_features or in_features
51
+ hidden_features = hidden_features or in_features
52
+ self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
53
+ self.act = act_layer()
54
+ self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
55
+ self.drop = nn.Dropout(drop)
56
+
57
+ def forward(self, x):
58
+ x = self.fc1(x)
59
+ x = self.act(x)
60
+ x = self.drop(x)
61
+ x = self.fc2(x)
62
+ x = self.drop(x)
63
+ return x
64
+
65
+
66
+ class CBlock(nn.Module):
67
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
68
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
69
+ super().__init__()
70
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
71
+ self.norm1 = nn.BatchNorm2d(dim)
72
+ self.conv1 = nn.Conv2d(dim, dim, 1)
73
+ self.conv2 = nn.Conv2d(dim, dim, 1)
74
+ self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
75
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
76
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
77
+ self.norm2 = nn.BatchNorm2d(dim)
78
+ mlp_hidden_dim = int(dim * mlp_ratio)
79
+ self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
80
+
81
+ def forward(self, x):
82
+ x = x + self.pos_embed(x)
83
+ x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x)))))
84
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
85
+ return x
86
+
87
+
88
+ class Attention(nn.Module):
89
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
90
+ super().__init__()
91
+ self.num_heads = num_heads
92
+ head_dim = dim // num_heads
93
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
94
+ self.scale = qk_scale or head_dim ** -0.5
95
+
96
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
97
+ self.attn_drop = nn.Dropout(attn_drop)
98
+ self.proj = nn.Linear(dim, dim)
99
+ self.proj_drop = nn.Dropout(proj_drop)
100
+
101
+ def forward(self, x):
102
+ B, N, C = x.shape
103
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
104
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
105
+
106
+ attn = (q @ k.transpose(-2, -1)) * self.scale
107
+ attn = attn.softmax(dim=-1)
108
+ attn = self.attn_drop(attn)
109
+
110
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
111
+ x = self.proj(x)
112
+ x = self.proj_drop(x)
113
+ return x
114
+
115
+
116
+ class SABlock(nn.Module):
117
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
118
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
119
+ super().__init__()
120
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
121
+ self.norm1 = norm_layer(dim)
122
+ self.attn = Attention(
123
+ dim,
124
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
125
+ attn_drop=attn_drop, proj_drop=drop)
126
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
127
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
128
+ self.norm2 = norm_layer(dim)
129
+ mlp_hidden_dim = int(dim * mlp_ratio)
130
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
131
+
132
+ def forward(self, x):
133
+ x = x + self.pos_embed(x)
134
+ B, N, H, W = x.shape
135
+ x = x.flatten(2).transpose(1, 2)
136
+ x = x + self.drop_path(self.attn(self.norm1(x)))
137
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
138
+ x = x.transpose(1, 2).reshape(B, N, H, W)
139
+ return x
140
+
141
+
142
+ def window_partition(x, window_size):
143
+ """
144
+ Args:
145
+ x: (B, H, W, C)
146
+ window_size (int): window size
147
+ Returns:
148
+ windows: (num_windows*B, window_size, window_size, C)
149
+ """
150
+ B, H, W, C = x.shape
151
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
152
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
153
+ return windows
154
+
155
+
156
+ def window_reverse(windows, window_size, H, W):
157
+ """
158
+ Args:
159
+ windows: (num_windows*B, window_size, window_size, C)
160
+ window_size (int): Window size
161
+ H (int): Height of image
162
+ W (int): Width of image
163
+ Returns:
164
+ x: (B, H, W, C)
165
+ """
166
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
167
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
168
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
169
+ return x
170
+
171
+
172
+ class SABlock_Windows(nn.Module):
173
+ def __init__(self, dim, num_heads, window_size=14, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
174
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
175
+ super().__init__()
176
+ self.window_size=window_size
177
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
178
+ self.norm1 = norm_layer(dim)
179
+ self.attn = Attention(
180
+ dim,
181
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
182
+ attn_drop=attn_drop, proj_drop=drop)
183
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
184
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
185
+ self.norm2 = norm_layer(dim)
186
+ mlp_hidden_dim = int(dim * mlp_ratio)
187
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
188
+
189
+ def forward(self, x):
190
+ x = x + self.pos_embed(x)
191
+ x = x.permute(0, 2, 3, 1)
192
+ B, H, W, C = x.shape
193
+ shortcut = x
194
+ x = self.norm1(x)
195
+
196
+ pad_l = pad_t = 0
197
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
198
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
199
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
200
+ _, Hp, Wp, _ = x.shape
201
+
202
+ x_windows = window_partition(x, self.window_size) # nW*B, window_size, window_size, C
203
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
204
+
205
+ # W-MSA/SW-MSA
206
+ attn_windows = self.attn(x_windows) # nW*B, window_size*window_size, C
207
+
208
+ # merge windows
209
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
210
+ x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
211
+
212
+ # reverse cyclic shift
213
+ if pad_r > 0 or pad_b > 0:
214
+ x = x[:, :H, :W, :].contiguous()
215
+
216
+ x = shortcut + self.drop_path(x)
217
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
218
+ x = x.permute(0, 3, 1, 2).reshape(B, C, H, W)
219
+ return x
220
+
221
+
222
+ class PatchEmbed(nn.Module):
223
+ """ Image to Patch Embedding
224
+ """
225
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
226
+ super().__init__()
227
+ img_size = to_2tuple(img_size)
228
+ patch_size = to_2tuple(patch_size)
229
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
230
+ self.img_size = img_size
231
+ self.patch_size = patch_size
232
+ self.num_patches = num_patches
233
+ self.norm = nn.LayerNorm(embed_dim)
234
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
235
+
236
+ def forward(self, x):
237
+ B, _, H, W = x.shape
238
+ x = self.proj(x)
239
+ B, _, H, W = x.shape
240
+ x = x.flatten(2).transpose(1, 2)
241
+ x = self.norm(x)
242
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
243
+ return x
244
+
245
+
246
+ @BACKBONES.register_module()
247
+ class UniFormer(nn.Module):
248
+ """ Vision Transformer
249
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
250
+ https://arxiv.org/abs/2010.11929
251
+ """
252
+ def __init__(self, layers=[3, 4, 8, 3], img_size=224, in_chans=3, num_classes=80, embed_dim=[64, 128, 320, 512],
253
+ head_dim=64, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
254
+ drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
255
+ pretrained_path=None, use_checkpoint=False, checkpoint_num=[0, 0, 0, 0],
256
+ windows=False, hybrid=False, window_size=14):
257
+ """
258
+ Args:
259
+ layers (list): number of blocks in each stage
260
+ img_size (int, tuple): input image size
261
+ in_chans (int): number of input channels
262
+ num_classes (int): number of classes for classification head
263
+ embed_dim (int): embedding dimension
264
+ head_dim (int): dimension of attention heads
265
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
266
+ qkv_bias (bool): enable bias for qkv if True
267
+ qk_scale (float): override default qk scale of head_dim ** -0.5 if set
268
+ representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
269
+ drop_rate (float): dropout rate
270
+ attn_drop_rate (float): attention dropout rate
271
+ drop_path_rate (float): stochastic depth rate
272
+ norm_layer (nn.Module): normalization layer
273
+ pretrained_path (str): path of pretrained model
274
+ use_checkpoint (bool): whether use checkpoint
275
+ checkpoint_num (list): index for using checkpoint in every stage
276
+ windows (bool): whether use window MHRA
277
+ hybrid (bool): whether use hybrid MHRA
278
+ window_size (int): size of window (>14)
279
+ """
280
+ super().__init__()
281
+ self.num_classes = num_classes
282
+ self.use_checkpoint = use_checkpoint
283
+ self.checkpoint_num = checkpoint_num
284
+ self.windows = windows
285
+ print(f'Use Checkpoint: {self.use_checkpoint}')
286
+ print(f'Checkpoint Number: {self.checkpoint_num}')
287
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
288
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
289
+
290
+ self.patch_embed1 = PatchEmbed(
291
+ img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0])
292
+ self.patch_embed2 = PatchEmbed(
293
+ img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1])
294
+ self.patch_embed3 = PatchEmbed(
295
+ img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2])
296
+ self.patch_embed4 = PatchEmbed(
297
+ img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3])
298
+
299
+ self.pos_drop = nn.Dropout(p=drop_rate)
300
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(layers))] # stochastic depth decay rule
301
+ num_heads = [dim // head_dim for dim in embed_dim]
302
+ self.blocks1 = nn.ModuleList([
303
+ CBlock(
304
+ dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
305
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
306
+ for i in range(layers[0])])
307
+ self.norm1=norm_layer(embed_dim[0])
308
+ self.blocks2 = nn.ModuleList([
309
+ CBlock(
310
+ dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
311
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]], norm_layer=norm_layer)
312
+ for i in range(layers[1])])
313
+ self.norm2 = norm_layer(embed_dim[1])
314
+ if self.windows:
315
+ print('Use local window for all blocks in stage3')
316
+ self.blocks3 = nn.ModuleList([
317
+ SABlock_Windows(
318
+ dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
319
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)
320
+ for i in range(layers[2])])
321
+ elif hybrid:
322
+ print('Use hybrid window for blocks in stage3')
323
+ block3 = []
324
+ for i in range(layers[2]):
325
+ if (i + 1) % 4 == 0:
326
+ block3.append(SABlock(
327
+ dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
328
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer))
329
+ else:
330
+ block3.append(SABlock_Windows(
331
+ dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
332
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer))
333
+ self.blocks3 = nn.ModuleList(block3)
334
+ else:
335
+ print('Use global window for all blocks in stage3')
336
+ self.blocks3 = nn.ModuleList([
337
+ SABlock(
338
+ dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
339
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)
340
+ for i in range(layers[2])])
341
+ self.norm3 = norm_layer(embed_dim[2])
342
+ self.blocks4 = nn.ModuleList([
343
+ SABlock(
344
+ dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
345
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]+layers[2]], norm_layer=norm_layer)
346
+ for i in range(layers[3])])
347
+ self.norm4 = norm_layer(embed_dim[3])
348
+
349
+ # Representation layer
350
+ if representation_size:
351
+ self.num_features = representation_size
352
+ self.pre_logits = nn.Sequential(OrderedDict([
353
+ ('fc', nn.Linear(embed_dim, representation_size)),
354
+ ('act', nn.Tanh())
355
+ ]))
356
+ else:
357
+ self.pre_logits = nn.Identity()
358
+
359
+ self.apply(self._init_weights)
360
+ self.init_weights(pretrained=pretrained_path)
361
+
362
+ def init_weights(self, pretrained):
363
+ if isinstance(pretrained, str):
364
+ logger = get_root_logger()
365
+ load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
366
+ print(f'Load pretrained model from {pretrained}')
367
+ def _init_weights(self, m):
368
+ if isinstance(m, nn.Linear):
369
+ trunc_normal_(m.weight, std=.02)
370
+ if isinstance(m, nn.Linear) and m.bias is not None:
371
+ nn.init.constant_(m.bias, 0)
372
+ elif isinstance(m, nn.LayerNorm):
373
+ nn.init.constant_(m.bias, 0)
374
+ nn.init.constant_(m.weight, 1.0)
375
+
376
+ @torch.jit.ignore
377
+ def no_weight_decay(self):
378
+ return {'pos_embed', 'cls_token'}
379
+
380
+ def get_classifier(self):
381
+ return self.head
382
+
383
+ def reset_classifier(self, num_classes, global_pool=''):
384
+ self.num_classes = num_classes
385
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
386
+
387
+ def forward_features(self, x):
388
+ out = []
389
+ x = self.patch_embed1(x)
390
+ x = self.pos_drop(x)
391
+ for i, blk in enumerate(self.blocks1):
392
+ if self.use_checkpoint and i < self.checkpoint_num[0]:
393
+ x = checkpoint.checkpoint(blk, x)
394
+ else:
395
+ x = blk(x)
396
+ x_out = self.norm1(x.permute(0, 2, 3, 1))
397
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
398
+ x = self.patch_embed2(x)
399
+ for i, blk in enumerate(self.blocks2):
400
+ if self.use_checkpoint and i < self.checkpoint_num[1]:
401
+ x = checkpoint.checkpoint(blk, x)
402
+ else:
403
+ x = blk(x)
404
+ x_out = self.norm2(x.permute(0, 2, 3, 1))
405
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
406
+ x = self.patch_embed3(x)
407
+ for i, blk in enumerate(self.blocks3):
408
+ if self.use_checkpoint and i < self.checkpoint_num[2]:
409
+ x = checkpoint.checkpoint(blk, x)
410
+ else:
411
+ x = blk(x)
412
+ x_out = self.norm3(x.permute(0, 2, 3, 1))
413
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
414
+ x = self.patch_embed4(x)
415
+ for i, blk in enumerate(self.blocks4):
416
+ if self.use_checkpoint and i < self.checkpoint_num[3]:
417
+ x = checkpoint.checkpoint(blk, x)
418
+ else:
419
+ x = blk(x)
420
+ x_out = self.norm4(x.permute(0, 2, 3, 1))
421
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
422
+ return tuple(out)
423
+
424
+ def forward(self, x):
425
+ x = self.forward_features(x)
426
+ return x
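
A shape-check sketch for the backbone defined above. It assumes `UniFormer` is importable from this module (timm and the bundled mmseg fallback must be available) and uses random weights, so `pretrained_path` stays `None`.

import torch

# Same hyper-parameters as the upernet_global_small config further below.
model = UniFormer(layers=[3, 4, 8, 3], embed_dim=[64, 128, 320, 512],
                  head_dim=64, drop_path_rate=0.0, windows=False, hybrid=False)
model.eval()

with torch.no_grad():
    feats = model(torch.randn(1, 3, 512, 512))  # forward() returns forward_features()

for f in feats:
    print(tuple(f.shape))
# Strides 4/8/16/32 with the configured widths:
# (1, 64, 128, 128), (1, 128, 64, 64), (1, 320, 32, 32), (1, 512, 16, 16)
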
extensions/microsoftexcel-controlnet/annotator/uniformer/upernet_global_small.py ADDED
@@ -0,0 +1,44 @@
1
+ _base_ = [
2
+ 'configs/_base_/models/upernet_uniformer.py',
3
+ 'configs/_base_/datasets/ade20k.py',
4
+ 'configs/_base_/default_runtime.py',
5
+ 'configs/_base_/schedules/schedule_160k.py'
6
+ ]
7
+
8
+ custom_imports = dict(
9
+ imports=['annotator.uniformer.uniformer'],
10
+ allow_failed_imports=False
11
+ )
12
+
13
+ model = dict(
14
+ backbone=dict(
15
+ type='UniFormer',
16
+ embed_dim=[64, 128, 320, 512],
17
+ layers=[3, 4, 8, 3],
18
+ head_dim=64,
19
+ drop_path_rate=0.25,
20
+ windows=False,
21
+ hybrid=False
22
+ ),
23
+ decode_head=dict(
24
+ in_channels=[64, 128, 320, 512],
25
+ num_classes=150
26
+ ),
27
+ auxiliary_head=dict(
28
+ in_channels=320,
29
+ num_classes=150
30
+ ))
31
+
32
+ # AdamW optimizer, no weight decay for position embedding & layer norm in backbone
33
+ optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
34
+ paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
35
+ 'relative_position_bias_table': dict(decay_mult=0.),
36
+ 'norm': dict(decay_mult=0.)}))
37
+
38
+ lr_config = dict(_delete_=True, policy='poly',
39
+ warmup='linear',
40
+ warmup_iters=1500,
41
+ warmup_ratio=1e-6,
42
+ power=1.0, min_lr=0.0, by_epoch=False)
43
+
44
+ data=dict(samples_per_gpu=2)
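
For context, this config only overrides the `_base_` files listed at its top; the optimizer and lr schedule sections matter for training, while the annotator only needs the model definition. A hedged sketch of consuming such a config with the standalone mmseg 0.x API follows; the checkpoint path and image name are placeholders.

from mmseg.apis import init_segmentor, inference_segmentor  # mmseg 0.x API

config_file = 'annotator/uniformer/upernet_global_small.py'      # this config
checkpoint_file = 'models/uniformer/upernet_global_small.pth'    # placeholder path

# Builds UperNet with the UniFormer backbone registered above and loads the weights.
model = init_segmentor(config_file, checkpoint_file, device='cuda:0')

# Returns a list with one HxW array of ADE20K class indices (150 classes).
result = inference_segmentor(model, 'demo.jpg')
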
extensions/microsoftexcel-controlnet/annotator/util.py ADDED
@@ -0,0 +1,79 @@
1
+ import numpy as np
2
+ import cv2
+ import random  # needed by img2mask below
3
+
4
+
5
+ def HWC3(x):
6
+ assert x.dtype == np.uint8
7
+ if x.ndim == 2:
8
+ x = x[:, :, None]
9
+ assert x.ndim == 3
10
+ H, W, C = x.shape
11
+ assert C == 1 or C == 3 or C == 4
12
+ if C == 3:
13
+ return x
14
+ if C == 1:
15
+ return np.concatenate([x, x, x], axis=2)
16
+ if C == 4:
17
+ color = x[:, :, 0:3].astype(np.float32)
18
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
19
+ y = color * alpha + 255.0 * (1.0 - alpha)
20
+ y = y.clip(0, 255).astype(np.uint8)
21
+ return y
22
+
23
+
24
+ def make_noise_disk(H, W, C, F):
25
+ noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
26
+ noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
27
+ noise = noise[F: F + H, F: F + W]
28
+ noise -= np.min(noise)
29
+ noise /= np.max(noise)
30
+ if C == 1:
31
+ noise = noise[:, :, None]
32
+ return noise
33
+
34
+
35
+ def nms(x, t, s):
36
+ x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
37
+
38
+ f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
39
+ f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
40
+ f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
41
+ f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
42
+
43
+ y = np.zeros_like(x)
44
+
45
+ for f in [f1, f2, f3, f4]:
46
+ np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
47
+
48
+ z = np.zeros_like(y, dtype=np.uint8)
49
+ z[y > t] = 255
50
+ return z
51
+
52
+
53
+ def min_max_norm(x):
54
+ x -= np.min(x)
55
+ x /= np.maximum(np.max(x), 1e-5)
56
+ return x
57
+
58
+
59
+ def safe_step(x, step=2):
60
+ y = x.astype(np.float32) * float(step + 1)
61
+ y = y.astype(np.int32).astype(np.float32) / float(step)
62
+ return y
63
+
64
+
65
+ def img2mask(img, H, W, low=10, high=90):
66
+ assert img.ndim == 3 or img.ndim == 2
67
+ assert img.dtype == np.uint8
68
+
69
+ if img.ndim == 3:
70
+ y = img[:, :, random.randrange(0, img.shape[2])]
71
+ else:
72
+ y = img
73
+
74
+ y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)
75
+
76
+ if random.uniform(0, 1) < 0.5:
77
+ y = 255 - y
78
+
79
+ return y < np.percentile(y, random.randrange(low, high))
extensions/microsoftexcel-controlnet/annotator/zoe/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
extensions/microsoftexcel-controlnet/annotator/zoe/__init__.py ADDED
@@ -0,0 +1,59 @@
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import torch
5
+
6
+ from einops import rearrange
7
+ from .zoedepth.models.zoedepth.zoedepth_v1 import ZoeDepth
8
+ from .zoedepth.utils.config import get_config
9
+ from modules import devices
10
+ from annotator.annotator_path import models_path
11
+
12
+
13
+ class ZoeDetector:
14
+ model_dir = os.path.join(models_path, "zoedepth")
15
+
16
+ def __init__(self):
17
+ self.model = None
18
+ self.device = devices.get_device_for("controlnet")
19
+
20
+ def load_model(self):
21
+ remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ZoeD_M12_N.pt"
22
+ modelpath = os.path.join(self.model_dir, "ZoeD_M12_N.pt")
23
+ if not os.path.exists(modelpath):
24
+ from basicsr.utils.download_util import load_file_from_url
25
+ load_file_from_url(remote_model_path, model_dir=self.model_dir)
26
+ conf = get_config("zoedepth", "infer")
27
+ model = ZoeDepth.build_from_config(conf)
28
+ model.load_state_dict(torch.load(modelpath, map_location=model.device)['model'])
29
+ model.eval()
30
+ self.model = model.to(self.device)
31
+
32
+ def unload_model(self):
33
+ if self.model is not None:
34
+ self.model.cpu()
35
+
36
+ def __call__(self, input_image):
37
+ if self.model is None:
38
+ self.load_model()
39
+ self.model.to(self.device)
40
+
41
+ assert input_image.ndim == 3
42
+ image_depth = input_image
43
+ with torch.no_grad():
44
+ image_depth = torch.from_numpy(image_depth).float().to(self.device)
45
+ image_depth = image_depth / 255.0
46
+ image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
47
+ depth = self.model.infer(image_depth)
48
+
49
+ depth = depth[0, 0].cpu().numpy()
50
+
51
+ vmin = np.percentile(depth, 2)
52
+ vmax = np.percentile(depth, 85)
53
+
54
+ depth -= vmin
55
+ depth /= vmax - vmin
56
+ depth = 1.0 - depth
57
+ depth_image = (depth * 255.0).clip(0, 255).astype(np.uint8)
58
+
59
+ return depth_image
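
A hedged driver sketch for the detector above. It only runs inside the webui process (the class imports `modules.devices`), and the first call downloads the ZoeD_M12_N.pt weights into the annotator models directory; the random image is a stand-in for a real frame.

import numpy as np

detector = ZoeDetector()
image = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)  # HxWx3 uint8 stand-in

depth_vis = detector(image)   # single-channel uint8 map; near regions end up bright after the 1.0 - depth flip
detector.unload_model()       # move weights back to CPU to free VRAM between jobs

print(depth_vis.shape, depth_vis.dtype)
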
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas.py ADDED
@@ -0,0 +1,379 @@
1
+ # MIT License
2
+ import os
3
+
4
+ # Copyright (c) 2022 Intelligent Systems Lab Org
5
+
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+
13
+ # The above copyright notice and this permission notice shall be included in all
14
+ # copies or substantial portions of the Software.
15
+
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+
24
+ # File author: Shariq Farooq Bhat
25
+
26
+ import torch
27
+ import torch.nn as nn
28
+ import numpy as np
29
+ from torchvision.transforms import Normalize
30
+
31
+
32
+ def denormalize(x):
33
+ """Reverses the imagenet normalization applied to the input.
34
+
35
+ Args:
36
+ x (torch.Tensor - shape(N,3,H,W)): input tensor
37
+
38
+ Returns:
39
+ torch.Tensor - shape(N,3,H,W): Denormalized input
40
+ """
41
+ mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
42
+ std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
43
+ return x * std + mean
44
+
45
+ def get_activation(name, bank):
46
+ def hook(model, input, output):
47
+ bank[name] = output
48
+ return hook
49
+
50
+
51
+ class Resize(object):
52
+ """Resize sample to given size (width, height).
53
+ """
54
+
55
+ def __init__(
56
+ self,
57
+ width,
58
+ height,
59
+ resize_target=True,
60
+ keep_aspect_ratio=False,
61
+ ensure_multiple_of=1,
62
+ resize_method="lower_bound",
63
+ ):
64
+ """Init.
65
+ Args:
66
+ width (int): desired output width
67
+ height (int): desired output height
68
+ resize_target (bool, optional):
69
+ True: Resize the full sample (image, mask, target).
70
+ False: Resize image only.
71
+ Defaults to True.
72
+ keep_aspect_ratio (bool, optional):
73
+ True: Keep the aspect ratio of the input sample.
74
+ Output sample might not have the given width and height, and
75
+ resize behaviour depends on the parameter 'resize_method'.
76
+ Defaults to False.
77
+ ensure_multiple_of (int, optional):
78
+ Output width and height is constrained to be multiple of this parameter.
79
+ Defaults to 1.
80
+ resize_method (str, optional):
81
+ "lower_bound": Output will be at least as large as the given size.
82
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
83
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
84
+ Defaults to "lower_bound".
85
+ """
86
+ print("Params passed to Resize transform:")
87
+ print("\twidth: ", width)
88
+ print("\theight: ", height)
89
+ print("\tresize_target: ", resize_target)
90
+ print("\tkeep_aspect_ratio: ", keep_aspect_ratio)
91
+ print("\tensure_multiple_of: ", ensure_multiple_of)
92
+ print("\tresize_method: ", resize_method)
93
+
94
+ self.__width = width
95
+ self.__height = height
96
+
97
+ self.__keep_aspect_ratio = keep_aspect_ratio
98
+ self.__multiple_of = ensure_multiple_of
99
+ self.__resize_method = resize_method
100
+
101
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
102
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
103
+
104
+ if max_val is not None and y > max_val:
105
+ y = (np.floor(x / self.__multiple_of)
106
+ * self.__multiple_of).astype(int)
107
+
108
+ if y < min_val:
109
+ y = (np.ceil(x / self.__multiple_of)
110
+ * self.__multiple_of).astype(int)
111
+
112
+ return y
113
+
114
+ def get_size(self, width, height):
115
+ # determine new height and width
116
+ scale_height = self.__height / height
117
+ scale_width = self.__width / width
118
+
119
+ if self.__keep_aspect_ratio:
120
+ if self.__resize_method == "lower_bound":
121
+ # scale such that output size is lower bound
122
+ if scale_width > scale_height:
123
+ # fit width
124
+ scale_height = scale_width
125
+ else:
126
+ # fit height
127
+ scale_width = scale_height
128
+ elif self.__resize_method == "upper_bound":
129
+ # scale such that output size is upper bound
130
+ if scale_width < scale_height:
131
+ # fit width
132
+ scale_height = scale_width
133
+ else:
134
+ # fit height
135
+ scale_width = scale_height
136
+ elif self.__resize_method == "minimal":
137
+ # scale as little as possible
138
+ if abs(1 - scale_width) < abs(1 - scale_height):
139
+ # fit width
140
+ scale_height = scale_width
141
+ else:
142
+ # fit height
143
+ scale_width = scale_height
144
+ else:
145
+ raise ValueError(
146
+ f"resize_method {self.__resize_method} not implemented"
147
+ )
148
+
149
+ if self.__resize_method == "lower_bound":
150
+ new_height = self.constrain_to_multiple_of(
151
+ scale_height * height, min_val=self.__height
152
+ )
153
+ new_width = self.constrain_to_multiple_of(
154
+ scale_width * width, min_val=self.__width
155
+ )
156
+ elif self.__resize_method == "upper_bound":
157
+ new_height = self.constrain_to_multiple_of(
158
+ scale_height * height, max_val=self.__height
159
+ )
160
+ new_width = self.constrain_to_multiple_of(
161
+ scale_width * width, max_val=self.__width
162
+ )
163
+ elif self.__resize_method == "minimal":
164
+ new_height = self.constrain_to_multiple_of(scale_height * height)
165
+ new_width = self.constrain_to_multiple_of(scale_width * width)
166
+ else:
167
+ raise ValueError(
168
+ f"resize_method {self.__resize_method} not implemented")
169
+
170
+ return (new_width, new_height)
171
+
172
+ def __call__(self, x):
173
+ width, height = self.get_size(*x.shape[-2:][::-1])
174
+ return nn.functional.interpolate(x, (height, width), mode='bilinear', align_corners=True)
175
+
176
+ class PrepForMidas(object):
177
+ def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True):
178
+ if isinstance(img_size, int):
179
+ img_size = (img_size, img_size)
180
+ net_h, net_w = img_size
181
+ self.normalization = Normalize(
182
+ mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
183
+ self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=32, resize_method=resize_mode) \
184
+ if do_resize else nn.Identity()
185
+
186
+ def __call__(self, x):
187
+ return self.normalization(self.resizer(x))
188
+
189
+
190
+ class MidasCore(nn.Module):
191
+ def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True,
192
+ img_size=384, **kwargs):
193
+ """Midas Base model used for multi-scale feature extraction.
194
+
195
+ Args:
196
+ midas (torch.nn.Module): Midas model.
197
+ trainable (bool, optional): Train midas model. Defaults to False.
198
+ fetch_features (bool, optional): Extract multi-scale features. Defaults to True.
199
+ layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1').
200
+ freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False.
201
+ keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True.
202
+ img_size (int, tuple, optional): Input resolution. Defaults to 384.
203
+ """
204
+ super().__init__()
205
+ self.core = midas
206
+ self.output_channels = None
207
+ self.core_out = {}
208
+ self.trainable = trainable
209
+ self.fetch_features = fetch_features
210
+ # midas.scratch.output_conv = nn.Identity()
211
+ self.handles = []
212
+ # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1']
213
+ self.layer_names = layer_names
214
+
215
+ self.set_trainable(trainable)
216
+ self.set_fetch_features(fetch_features)
217
+
218
+ self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio,
219
+ img_size=img_size, do_resize=kwargs.get('do_resize', True))
220
+
221
+ if freeze_bn:
222
+ self.freeze_bn()
223
+
224
+ def set_trainable(self, trainable):
225
+ self.trainable = trainable
226
+ if trainable:
227
+ self.unfreeze()
228
+ else:
229
+ self.freeze()
230
+ return self
231
+
232
+ def set_fetch_features(self, fetch_features):
233
+ self.fetch_features = fetch_features
234
+ if fetch_features:
235
+ if len(self.handles) == 0:
236
+ self.attach_hooks(self.core)
237
+ else:
238
+ self.remove_hooks()
239
+ return self
240
+
241
+ def freeze(self):
242
+ for p in self.parameters():
243
+ p.requires_grad = False
244
+ self.trainable = False
245
+ return self
246
+
247
+ def unfreeze(self):
248
+ for p in self.parameters():
249
+ p.requires_grad = True
250
+ self.trainable = True
251
+ return self
252
+
253
+ def freeze_bn(self):
254
+ for m in self.modules():
255
+ if isinstance(m, nn.BatchNorm2d):
256
+ m.eval()
257
+ return self
258
+
259
+ def forward(self, x, denorm=False, return_rel_depth=False):
260
+ with torch.no_grad():
261
+ if denorm:
262
+ x = denormalize(x)
263
+ x = self.prep(x)
264
+ # print("Shape after prep: ", x.shape)
265
+
266
+ with torch.set_grad_enabled(self.trainable):
267
+
268
+ # print("Input size to Midascore", x.shape)
269
+ rel_depth = self.core(x)
270
+ # print("Output from midas shape", rel_depth.shape)
271
+ if not self.fetch_features:
272
+ return rel_depth
273
+ out = [self.core_out[k] for k in self.layer_names]
274
+
275
+ if return_rel_depth:
276
+ return rel_depth, out
277
+ return out
278
+
279
+ def get_rel_pos_params(self):
280
+ for name, p in self.core.pretrained.named_parameters():
281
+ if "relative_position" in name:
282
+ yield p
283
+
284
+ def get_enc_params_except_rel_pos(self):
285
+ for name, p in self.core.pretrained.named_parameters():
286
+ if "relative_position" not in name:
287
+ yield p
288
+
289
+ def freeze_encoder(self, freeze_rel_pos=False):
290
+ if freeze_rel_pos:
291
+ for p in self.core.pretrained.parameters():
292
+ p.requires_grad = False
293
+ else:
294
+ for p in self.get_enc_params_except_rel_pos():
295
+ p.requires_grad = False
296
+ return self
297
+
298
+ def attach_hooks(self, midas):
299
+ if len(self.handles) > 0:
300
+ self.remove_hooks()
301
+ if "out_conv" in self.layer_names:
302
+ self.handles.append(list(midas.scratch.output_conv.children())[
303
+ 3].register_forward_hook(get_activation("out_conv", self.core_out)))
304
+ if "r4" in self.layer_names:
305
+ self.handles.append(midas.scratch.refinenet4.register_forward_hook(
306
+ get_activation("r4", self.core_out)))
307
+ if "r3" in self.layer_names:
308
+ self.handles.append(midas.scratch.refinenet3.register_forward_hook(
309
+ get_activation("r3", self.core_out)))
310
+ if "r2" in self.layer_names:
311
+ self.handles.append(midas.scratch.refinenet2.register_forward_hook(
312
+ get_activation("r2", self.core_out)))
313
+ if "r1" in self.layer_names:
314
+ self.handles.append(midas.scratch.refinenet1.register_forward_hook(
315
+ get_activation("r1", self.core_out)))
316
+ if "l4_rn" in self.layer_names:
317
+ self.handles.append(midas.scratch.layer4_rn.register_forward_hook(
318
+ get_activation("l4_rn", self.core_out)))
319
+
320
+ return self
321
+
322
+ def remove_hooks(self):
323
+ for h in self.handles:
324
+ h.remove()
325
+ return self
326
+
327
+ def __del__(self):
328
+ self.remove_hooks()
329
+
330
+ def set_output_channels(self, model_type):
331
+ self.output_channels = MIDAS_SETTINGS[model_type]
332
+
333
+ @staticmethod
334
+ def build(midas_model_type="DPT_BEiT_L_384", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs):
335
+ if midas_model_type not in MIDAS_SETTINGS:
336
+ raise ValueError(
337
+ f"Invalid model type: {midas_model_type}. Must be one of {list(MIDAS_SETTINGS.keys())}")
338
+ if "img_size" in kwargs:
339
+ kwargs = MidasCore.parse_img_size(kwargs)
340
+ img_size = kwargs.pop("img_size", [384, 384])
341
+ print("img_size", img_size)
342
+ midas_path = os.path.join(os.path.dirname(__file__), 'midas_repo')
343
+ midas = torch.hub.load(midas_path, midas_model_type,
344
+ pretrained=use_pretrained_midas, force_reload=force_reload, source='local')
345
+ kwargs.update({'keep_aspect_ratio': force_keep_ar})
346
+ midas_core = MidasCore(midas, trainable=train_midas, fetch_features=fetch_features,
347
+ freeze_bn=freeze_bn, img_size=img_size, **kwargs)
348
+ midas_core.set_output_channels(midas_model_type)
349
+ return midas_core
350
+
351
+ @staticmethod
352
+ def build_from_config(config):
353
+ return MidasCore.build(**config)
354
+
355
+ @staticmethod
356
+ def parse_img_size(config):
357
+ assert 'img_size' in config
358
+ if isinstance(config['img_size'], str):
359
+ assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W"
360
+ config['img_size'] = list(map(int, config['img_size'].split(",")))
361
+ assert len(
362
+ config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W"
363
+ elif isinstance(config['img_size'], int):
364
+ config['img_size'] = [config['img_size'], config['img_size']]
365
+ else:
366
+ assert isinstance(config['img_size'], list) and len(
367
+ config['img_size']) == 2, "img_size should be a list of H,W"
368
+ return config
369
+
370
+
371
+ nchannels2models = {
372
+ tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"],
373
+ (512, 256, 128, 64, 64): ["MiDaS_small"]
374
+ }
375
+
376
+ # Model name to number of output channels
377
+ MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items()
378
+ for m in v
379
+ }
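
A shape-check sketch for the wrapper above. It assumes the vendored `midas_repo` and timm are importable; the keyword names mirror `MidasCore.build`, and `use_pretrained_midas=False` skips the weight download so only the architecture is exercised. Exact spatial sizes depend on the internal Resize step.

import torch

core = MidasCore.build(midas_model_type="DPT_BEiT_L_384",
                       train_midas=False,
                       use_pretrained_midas=False,   # architecture only, no download
                       fetch_features=True,
                       freeze_bn=True,
                       img_size=384)
core.eval()

x = torch.rand(1, 3, 480, 640)                 # values in [0, 1]; PrepForMidas normalizes internally
rel_depth, feats = core(x, return_rel_depth=True)

print(rel_depth.shape)                          # relative (inverse) depth map
print([f.shape[1] for f in feats])              # per-level channel counts consumed by ZoeDepth's decoder
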
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/.gitignore ADDED
@@ -0,0 +1,110 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ .hypothesis/
48
+ .pytest_cache/
49
+
50
+ # Translations
51
+ *.mo
52
+ *.pot
53
+
54
+ # Django stuff:
55
+ *.log
56
+ local_settings.py
57
+ db.sqlite3
58
+
59
+ # Flask stuff:
60
+ instance/
61
+ .webassets-cache
62
+
63
+ # Scrapy stuff:
64
+ .scrapy
65
+
66
+ # Sphinx documentation
67
+ docs/_build/
68
+
69
+ # PyBuilder
70
+ target/
71
+
72
+ # Jupyter Notebook
73
+ .ipynb_checkpoints
74
+
75
+ # pyenv
76
+ .python-version
77
+
78
+ # celery beat schedule file
79
+ celerybeat-schedule
80
+
81
+ # SageMath parsed files
82
+ *.sage.py
83
+
84
+ # Environments
85
+ .env
86
+ .venv
87
+ env/
88
+ venv/
89
+ ENV/
90
+ env.bak/
91
+ venv.bak/
92
+
93
+ # Spyder project settings
94
+ .spyderproject
95
+ .spyproject
96
+
97
+ # Rope project settings
98
+ .ropeproject
99
+
100
+ # mkdocs documentation
101
+ /site
102
+
103
+ # mypy
104
+ .mypy_cache/
105
+
106
+ *.png
107
+ *.pfm
108
+ *.jpg
109
+ *.jpeg
110
+ *.pt
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/Dockerfile ADDED
@@ -0,0 +1,29 @@
1
+ # enables cuda support in docker
2
+ FROM nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
3
+
4
+ # install python 3.6, pip and requirements for opencv-python
5
+ # (see https://github.com/NVIDIA/nvidia-docker/issues/864)
6
+ RUN apt-get update && apt-get -y install \
7
+ python3 \
8
+ python3-pip \
9
+ libsm6 \
10
+ libxext6 \
11
+ libxrender-dev \
12
+ curl \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # install python dependencies
16
+ RUN pip3 install --upgrade pip
17
+ RUN pip3 install torch~=1.8 torchvision opencv-python-headless~=3.4 timm
18
+
19
+ # copy inference code
20
+ WORKDIR /opt/MiDaS
21
+ COPY ./midas ./midas
22
+ COPY ./*.py ./
23
+
24
+ # download model weights so the docker image can be used offline
25
+ RUN mkdir -p weights && cd weights && curl -OL https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt
26
+ RUN python3 run.py --model_type dpt_hybrid; exit 0
27
+
28
+ # entrypoint (dont forget to mount input and output directories)
29
+ CMD python3 run.py --model_type dpt_hybrid
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/README.md ADDED
@@ -0,0 +1,259 @@
1
+ ## Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer
2
+
3
+ This repository contains code to compute depth from a single image. It accompanies our [paper](https://arxiv.org/abs/1907.01341v3):
4
+
5
+ >Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer
6
+ René Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, Vladlen Koltun
7
+
8
+
9
+ and our [preprint](https://arxiv.org/abs/2103.13413):
10
+
11
+ > Vision Transformers for Dense Prediction
12
+ > René Ranftl, Alexey Bochkovskiy, Vladlen Koltun
13
+
14
+
15
+ MiDaS was trained on up to 12 datasets (ReDWeb, DIML, Movies, MegaDepth, WSVD, TartanAir, HRWSI, ApolloScape, BlendedMVS, IRS, KITTI, NYU Depth V2) with
16
+ multi-objective optimization.
17
+ The original model that was trained on 5 datasets (`MIX 5` in the paper) can be found [here](https://github.com/isl-org/MiDaS/releases/tag/v2).
18
+ The figure below shows an overview of the different MiDaS models; the bubble size scales with number of parameters.
19
+
20
+ ![](figures/Improvement_vs_FPS.png)
21
+
22
+ ### Setup
23
+
24
+ 1) Pick one or more models and download the corresponding weights to the `weights` folder:
25
+
26
+ MiDaS 3.1
27
+ - For highest quality: [dpt_beit_large_512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt)
28
+ - For moderately less quality, but better speed-performance trade-off: [dpt_swin2_large_384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt)
29
+ - For embedded devices: [dpt_swin2_tiny_256](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt), [dpt_levit_224](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt)
30
+ - For inference on Intel CPUs, OpenVINO may be used for the small legacy model: openvino_midas_v21_small [.xml](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.xml), [.bin](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.bin)
31
+
32
+ MiDaS 3.0: Legacy transformer models [dpt_large_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) and [dpt_hybrid_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt)
33
+
34
+ MiDaS 2.1: Legacy convolutional models [midas_v21_384](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) and [midas_v21_small_256](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt)
35
+
36
+ 1) Set up dependencies:
37
+
38
+ ```shell
39
+ conda env create -f environment.yaml
40
+ conda activate midas-py310
41
+ ```
42
+
43
+ #### optional
44
+
45
+ For the Next-ViT model, execute
46
+
47
+ ```shell
48
+ git submodule add https://github.com/isl-org/Next-ViT midas/external/next_vit
49
+ ```
50
+
51
+ For the OpenVINO model, install
52
+
53
+ ```shell
54
+ pip install openvino
55
+ ```
56
+
57
+ ### Usage
58
+
59
+ 1) Place one or more input images in the folder `input`.
60
+
61
+ 2) Run the model with
62
+
63
+ ```shell
64
+ python run.py --model_type <model_type> --input_path input --output_path output
65
+ ```
66
+ where ```<model_type>``` is chosen from [dpt_beit_large_512](#model_type), [dpt_beit_large_384](#model_type),
67
+ [dpt_beit_base_384](#model_type), [dpt_swin2_large_384](#model_type), [dpt_swin2_base_384](#model_type),
68
+ [dpt_swin2_tiny_256](#model_type), [dpt_swin_large_384](#model_type), [dpt_next_vit_large_384](#model_type),
69
+ [dpt_levit_224](#model_type), [dpt_large_384](#model_type), [dpt_hybrid_384](#model_type),
70
+ [midas_v21_384](#model_type), [midas_v21_small_256](#model_type), [openvino_midas_v21_small_256](#model_type).
71
+
72
+ 3) The resulting depth maps are written to the `output` folder.
73
+
74
+ #### optional
75
+
76
+ 1) By default, the inference resizes the height of input images to the size of a model to fit into the encoder. This
77
+ size is given by the numbers in the model names of the [accuracy table](#accuracy). Some models support not only a single
78
+ inference height but a range of different heights. Feel free to explore different heights by appending the extra
79
+ command line argument `--height`. Unsupported height values will throw an error. Note that using this argument may
80
+ decrease the model accuracy.
81
+ 2) By default, the inference keeps the aspect ratio of input images when feeding them into the encoder if this is
82
+ supported by a model (all models except for Swin, Swin2, LeViT). In order to resize to a square resolution,
83
+ disregarding the aspect ratio while preserving the height, use the command line argument `--square` (see the sketch after this list).
84
+
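For example, a combined invocation might look like the sketch below (the model type and height value are illustrative; supported heights depend on the chosen model):

```shell
python run.py --model_type dpt_beit_large_512 --input_path input --output_path output --height 352 --square
```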
85
+ #### via Camera
86
+
87
+ If you want the input images to be grabbed from the camera and shown in a window, omit the input and output paths
88
+ and choose a model type as shown above:
89
+
90
+ ```shell
91
+ python run.py --model_type <model_type> --side
92
+ ```
93
+
94
+ The argument `--side` is optional and causes both the input RGB image and the output depth map to be shown
95
+ side-by-side for comparison.
96
+
97
+ #### via Docker
98
+
99
+ 1) Make sure you have installed Docker and the
100
+ [NVIDIA Docker runtime](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-\(Native-GPU-Support\)).
101
+
102
+ 2) Build the Docker image:
103
+
104
+ ```shell
105
+ docker build -t midas .
106
+ ```
107
+
108
+ 3) Run inference:
109
+
110
+ ```shell
111
+ docker run --rm --gpus all -v $PWD/input:/opt/MiDaS/input -v $PWD/output:/opt/MiDaS/output -v $PWD/weights:/opt/MiDaS/weights midas
112
+ ```
113
+
114
+ This command passes through all of your NVIDIA GPUs to the container, mounts the
115
+ `input` and `output` directories and then runs the inference.
116
+
117
+ #### via PyTorch Hub
118
+
119
+ The pretrained model is also available on [PyTorch Hub](https://pytorch.org/hub/intelisl_midas_v2/)
120
+
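A minimal loading sketch via `torch.hub` (the entry-point names correspond to `hubconf.py` in this repository; the first call downloads the weights):

```python
import torch

# load a MiDaS model and the matching input transforms from PyTorch Hub
midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid", pretrained=True)
midas.eval()

midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = midas_transforms.dpt_transform  # use small_transform for MiDaS_small
```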
121
+ #### via TensorFlow or ONNX
122
+
123
+ See [README](https://github.com/isl-org/MiDaS/tree/master/tf) in the `tf` subdirectory.
124
+
125
+ Currently only supports MiDaS v2.1.
126
+
127
+
128
+ #### via Mobile (iOS / Android)
129
+
130
+ See [README](https://github.com/isl-org/MiDaS/tree/master/mobile) in the `mobile` subdirectory.
131
+
132
+ #### via ROS1 (Robot Operating System)
133
+
134
+ See [README](https://github.com/isl-org/MiDaS/tree/master/ros) in the `ros` subdirectory.
135
+
136
+ Currently only supports MiDaS v2.1. DPT-based models to be added.
137
+
138
+
139
+ ### Accuracy
140
+
141
+ We provide a **zero-shot error** $\epsilon_d$ which is evaluated for 6 different datasets
142
+ (see [paper](https://arxiv.org/abs/1907.01341v3)). **Lower error values are better**.
143
+ $\color{green}{\textsf{Overall model quality is represented by the improvement}}$ ([Imp.](#improvement)) with respect to
144
+ MiDaS 3.0 DPT<sub>L-384</sub>. The models are grouped by the height used for inference, whereas the square training resolution is given by
145
+ the numbers in the model names. The table also shows the **number of parameters** (in millions) and the
146
+ **frames per second** for inference at the training resolution (for GPU RTX 3090):
147
+
148
+ | MiDaS Model | DIW </br><sup>WHDR</sup> | Eth3d </br><sup>AbsRel</sup> | Sintel </br><sup>AbsRel</sup> | TUM </br><sup>δ1</sup> | KITTI </br><sup>δ1</sup> | NYUv2 </br><sup>δ1</sup> | $\color{green}{\textsf{Imp.}}$ </br><sup>%</sup> | Par.</br><sup>M</sup> | FPS</br><sup>&nbsp;</sup> |
149
+ |-----------------------------------------------------------------------------------------------------------------------|-------------------------:|-----------------------------:|------------------------------:|-------------------------:|-------------------------:|-------------------------:|-------------------------------------------------:|----------------------:|--------------------------:|
150
+ | **Inference height 512** | | | | | | | | | |
151
+ | [v3.1 BEiT<sub>L-512</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1137 | 0.0659 | 0.2366 | **6.13** | 11.56* | **1.86*** | $\color{green}{\textsf{19}}$ | **345** | **5.7** |
152
+ | [v3.1 BEiT<sub>L-512</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt)$\tiny{\square}$ | **0.1121** | **0.0614** | **0.2090** | 6.46 | **5.00*** | 1.90* | $\color{green}{\textsf{34}}$ | **345** | **5.7** |
153
+ | | | | | | | | | | |
154
+ | **Inference height 384** | | | | | | | | | |
155
+ | [v3.1 BEiT<sub>L-512</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1245 | 0.0681 | **0.2176** | **6.13** | 6.28* | **2.16*** | $\color{green}{\textsf{28}}$ | 345 | 12 |
156
+ | [v3.1 Swin2<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt)$\tiny{\square}$ | 0.1106 | 0.0732 | 0.2442 | 8.87 | **5.84*** | 2.92* | $\color{green}{\textsf{22}}$ | 213 | 41 |
157
+ | [v3.1 Swin2<sub>B-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt)$\tiny{\square}$ | 0.1095 | 0.0790 | 0.2404 | 8.93 | 5.97* | 3.28* | $\color{green}{\textsf{22}}$ | 102 | 39 |
158
+ | [v3.1 Swin<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt)$\tiny{\square}$ | 0.1126 | 0.0853 | 0.2428 | 8.74 | 6.60* | 3.34* | $\color{green}{\textsf{17}}$ | 213 | 49 |
159
+ | [v3.1 BEiT<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt) | 0.1239 | **0.0667** | 0.2545 | 7.17 | 9.84* | 2.21* | $\color{green}{\textsf{17}}$ | 344 | 13 |
160
+ | [v3.1 Next-ViT<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt) | **0.1031** | 0.0954 | 0.2295 | 9.21 | 6.89* | 3.47* | $\color{green}{\textsf{16}}$ | **72** | 30 |
161
+ | [v3.1 BEiT<sub>B-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt) | 0.1159 | 0.0967 | 0.2901 | 9.88 | 26.60* | 3.91* | $\color{green}{\textsf{-31}}$ | 112 | 31 |
162
+ | [v3.0 DPT<sub>L-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) | 0.1082 | 0.0888 | 0.2697 | 9.97 | 8.46 | 8.32 | $\color{green}{\textsf{0}}$ | 344 | **61** |
163
+ | [v3.0 DPT<sub>H-384</sub>](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt) | 0.1106 | 0.0934 | 0.2741 | 10.89 | 11.56 | 8.69 | $\color{green}{\textsf{-10}}$ | 123 | 50 |
164
+ | [v2.1 Large<sub>384</sub>](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) | 0.1295 | 0.1155 | 0.3285 | 12.51 | 16.08 | 8.71 | $\color{green}{\textsf{-32}}$ | 105 | 47 |
165
+ | | | | | | | | | | |
166
+ | **Inference height 256** | | | | | | | | | |
167
+ | [v3.1 Swin2<sub>T-256</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt)$\tiny{\square}$ | **0.1211** | **0.1106** | **0.2868** | **13.43** | **10.13*** | **5.55*** | $\color{green}{\textsf{-11}}$ | 42 | 64 |
168
+ | [v2.1 Small<sub>256</sub>](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt) | 0.1344 | 0.1344 | 0.3370 | 14.53 | 29.27 | 13.43 | $\color{green}{\textsf{-76}}$ | **21** | **90** |
169
+ | | | | | | | | | | |
170
+ | **Inference height 224** | | | | | | | | | |
171
+ | [v3.1 LeViT<sub>224</sub>](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt)$\tiny{\square}$ | **0.1314** | **0.1206** | **0.3148** | **18.21** | **15.27*** | **8.64*** | $\color{green}{\textsf{-40}}$ | **51** | **73** |
172
+
173
+ &ast; No zero-shot error, because models are also trained on KITTI and NYU Depth V2\
174
+ $\square$ Validation performed at **square resolution**, either because the transformer encoder backbone of a model
175
+ does not support non-square resolutions (Swin, Swin2, LeViT) or for comparison with these models. All other
176
+ validations keep the aspect ratio. A difference in resolution limits the comparability of the zero-shot error and the
177
+ improvement, because these quantities are averages over the pixels of an image and do not take into account the
178
+ advantage of more details due to a higher resolution.\
179
+ Best values per column and same validation height in bold
180
+
181
+ #### Improvement
182
+
183
+ The improvement in the above table is defined as the relative zero-shot error with respect to MiDaS v3.0
184
+ DPT<sub>L-384</sub> and averaging over the datasets. So, if $\epsilon_d$ is the zero-shot error for dataset $d$, then
185
+ the $\color{green}{\textsf{improvement}}$ is given by $100(1-(1/6)\sum_d\epsilon_d/\epsilon_{d,\rm{DPT_{L-384}}})$%.
186
+
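As a toy illustration of this formula (the error values below are made up and are not taken from the table):

```python
# zero-shot errors of a hypothetical model and of the v3.0 DPT_L-384 reference on the 6 datasets
eps_model = [0.110, 0.065, 0.230, 6.5, 6.0, 2.0]
eps_ref = [0.108, 0.089, 0.270, 10.0, 8.5, 8.3]

improvement = 100 * (1 - sum(m / r for m, r in zip(eps_model, eps_ref)) / 6)
print(f"{improvement:.0f}%")  # positive means better than the reference on average
```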
187
+ Note that the improvements of 10% for MiDaS v2.0 &rarr; v2.1 and 21% for MiDaS v2.1 &rarr; v3.0 are not visible from the
188
+ improvement column (Imp.) in the table but would require an evaluation with respect to MiDaS v2.1 Large<sub>384</sub>
189
+ and v2.0 Large<sub>384</sub> respectively instead of v3.0 DPT<sub>L-384</sub>.
190
+
191
+ ### Depth map comparison
192
+
193
+ Zoom in for better visibility
194
+ ![](figures/Comparison.png)
195
+
196
+ ### Speed on Camera Feed
197
+
198
+ Test configuration
199
+ - Windows 10
200
+ - 11th Gen Intel Core i7-1185G7 3.00GHz
201
+ - 16GB RAM
202
+ - Camera resolution 640x480
203
+ - openvino_midas_v21_small_256
204
+
205
+ Speed: 22 FPS
206
+
207
+ ### Changelog
208
+
209
+ * [Dec 2022] Released MiDaS v3.1:
210
+ - New models based on 5 different types of transformers ([BEiT](https://arxiv.org/pdf/2106.08254.pdf), [Swin2](https://arxiv.org/pdf/2111.09883.pdf), [Swin](https://arxiv.org/pdf/2103.14030.pdf), [Next-ViT](https://arxiv.org/pdf/2207.05501.pdf), [LeViT](https://arxiv.org/pdf/2104.01136.pdf))
211
+ - Training datasets extended from 10 to 12, including also KITTI and NYU Depth V2 using [BTS](https://github.com/cleinc/bts) split
212
+ - Best model, BEiT<sub>Large 512</sub>, with resolution 512x512, is on average about [28% more accurate](#Accuracy) than MiDaS v3.0
213
+ - Integrated live depth estimation from camera feed
214
+ * [Sep 2021] Integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See [Gradio Web Demo](https://huggingface.co/spaces/akhaliq/DPT-Large).
215
+ * [Apr 2021] Released MiDaS v3.0:
216
+ - New models based on [Dense Prediction Transformers](https://arxiv.org/abs/2103.13413) are on average [21% more accurate](#Accuracy) than MiDaS v2.1
217
+ - Additional models can be found [here](https://github.com/isl-org/DPT)
218
+ * [Nov 2020] Released MiDaS v2.1:
219
+ - New model that was trained on 10 datasets and is on average about [10% more accurate](#Accuracy) than [MiDaS v2.0](https://github.com/isl-org/MiDaS/releases/tag/v2)
220
+ - New light-weight model that achieves [real-time performance](https://github.com/isl-org/MiDaS/tree/master/mobile) on mobile platforms.
221
+ - Sample applications for [iOS](https://github.com/isl-org/MiDaS/tree/master/mobile/ios) and [Android](https://github.com/isl-org/MiDaS/tree/master/mobile/android)
222
+ - [ROS package](https://github.com/isl-org/MiDaS/tree/master/ros) for easy deployment on robots
223
+ * [Jul 2020] Added TensorFlow and ONNX code. Added [online demo](http://35.202.76.57/).
224
+ * [Dec 2019] Released new version of MiDaS - the new model is significantly more accurate and robust
225
+ * [Jul 2019] Initial release of MiDaS ([Link](https://github.com/isl-org/MiDaS/releases/tag/v1))
226
+
227
+ ### Citation
228
+
229
+ Please cite our paper if you use this code or any of the models:
230
+ ```
231
+ @ARTICLE {Ranftl2022,
232
+ author = "Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun",
233
+ title = "Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-Shot Cross-Dataset Transfer",
234
+ journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence",
235
+ year = "2022",
236
+ volume = "44",
237
+ number = "3"
238
+ }
239
+ ```
240
+
241
+ If you use a DPT-based model, please also cite:
242
+
243
+ ```
244
+ @article{Ranftl2021,
245
+ author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun},
246
+ title = {Vision Transformers for Dense Prediction},
247
+ journal = {ICCV},
248
+ year = {2021},
249
+ }
250
+ ```
251
+
252
+ ### Acknowledgements
253
+
254
+ Our work builds on and uses code from [timm](https://github.com/rwightman/pytorch-image-models) and [Next-ViT](https://github.com/bytedance/Next-ViT).
255
+ We'd like to thank the authors for making these libraries available.
256
+
257
+ ### License
258
+
259
+ MIT License
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/environment.yaml ADDED
@@ -0,0 +1,16 @@
1
+ name: midas-py310
2
+ channels:
3
+ - pytorch
4
+ - defaults
5
+ dependencies:
6
+ - nvidia::cudatoolkit=11.7
7
+ - python=3.10.8
8
+ - pytorch::pytorch=1.13.0
9
+ - torchvision=0.14.0
10
+ - pip=22.3.1
11
+ - numpy=1.23.4
12
+ - pip:
13
+ - opencv-python==4.6.0.66
14
+ - imutils==0.5.4
15
+ - timm==0.6.12
16
+ - einops==0.6.0
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/hubconf.py ADDED
@@ -0,0 +1,435 @@
1
+ dependencies = ["torch"]
2
+
3
+ import torch
4
+
5
+ from midas.dpt_depth import DPTDepthModel
6
+ from midas.midas_net import MidasNet
7
+ from midas.midas_net_custom import MidasNet_small
8
+
9
+ def DPT_BEiT_L_512(pretrained=True, **kwargs):
10
+ """ # This docstring shows up in hub.help()
11
+ MiDaS DPT_BEiT_L_512 model for monocular depth estimation
12
+ pretrained (bool): load pretrained weights into model
13
+ """
14
+
15
+ model = DPTDepthModel(
16
+ path=None,
17
+ backbone="beitl16_512",
18
+ non_negative=True,
19
+ )
20
+
21
+ if pretrained:
22
+ checkpoint = (
23
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt"
24
+ )
25
+ state_dict = torch.hub.load_state_dict_from_url(
26
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
27
+ )
28
+ model.load_state_dict(state_dict)
29
+
30
+ return model
31
+
32
+ def DPT_BEiT_L_384(pretrained=True, **kwargs):
33
+ """ # This docstring shows up in hub.help()
34
+ MiDaS DPT_BEiT_L_384 model for monocular depth estimation
35
+ pretrained (bool): load pretrained weights into model
36
+ """
37
+
38
+ model = DPTDepthModel(
39
+ path=None,
40
+ backbone="beitl16_384",
41
+ non_negative=True,
42
+ )
43
+
44
+ if pretrained:
45
+ checkpoint = (
46
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt"
47
+ )
48
+ state_dict = torch.hub.load_state_dict_from_url(
49
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
50
+ )
51
+ model.load_state_dict(state_dict)
52
+
53
+ return model
54
+
55
+ def DPT_BEiT_B_384(pretrained=True, **kwargs):
56
+ """ # This docstring shows up in hub.help()
57
+ MiDaS DPT_BEiT_B_384 model for monocular depth estimation
58
+ pretrained (bool): load pretrained weights into model
59
+ """
60
+
61
+ model = DPTDepthModel(
62
+ path=None,
63
+ backbone="beitb16_384",
64
+ non_negative=True,
65
+ )
66
+
67
+ if pretrained:
68
+ checkpoint = (
69
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt"
70
+ )
71
+ state_dict = torch.hub.load_state_dict_from_url(
72
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
73
+ )
74
+ model.load_state_dict(state_dict)
75
+
76
+ return model
77
+
78
+ def DPT_SwinV2_L_384(pretrained=True, **kwargs):
79
+ """ # This docstring shows up in hub.help()
80
+ MiDaS DPT_SwinV2_L_384 model for monocular depth estimation
81
+ pretrained (bool): load pretrained weights into model
82
+ """
83
+
84
+ model = DPTDepthModel(
85
+ path=None,
86
+ backbone="swin2l24_384",
87
+ non_negative=True,
88
+ )
89
+
90
+ if pretrained:
91
+ checkpoint = (
92
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt"
93
+ )
94
+ state_dict = torch.hub.load_state_dict_from_url(
95
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
96
+ )
97
+ model.load_state_dict(state_dict)
98
+
99
+ return model
100
+
101
+ def DPT_SwinV2_B_384(pretrained=True, **kwargs):
102
+ """ # This docstring shows up in hub.help()
103
+ MiDaS DPT_SwinV2_B_384 model for monocular depth estimation
104
+ pretrained (bool): load pretrained weights into model
105
+ """
106
+
107
+ model = DPTDepthModel(
108
+ path=None,
109
+ backbone="swin2b24_384",
110
+ non_negative=True,
111
+ )
112
+
113
+ if pretrained:
114
+ checkpoint = (
115
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt"
116
+ )
117
+ state_dict = torch.hub.load_state_dict_from_url(
118
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
119
+ )
120
+ model.load_state_dict(state_dict)
121
+
122
+ return model
123
+
124
+ def DPT_SwinV2_T_256(pretrained=True, **kwargs):
125
+ """ # This docstring shows up in hub.help()
126
+ MiDaS DPT_SwinV2_T_256 model for monocular depth estimation
127
+ pretrained (bool): load pretrained weights into model
128
+ """
129
+
130
+ model = DPTDepthModel(
131
+ path=None,
132
+ backbone="swin2t16_256",
133
+ non_negative=True,
134
+ )
135
+
136
+ if pretrained:
137
+ checkpoint = (
138
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt"
139
+ )
140
+ state_dict = torch.hub.load_state_dict_from_url(
141
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
142
+ )
143
+ model.load_state_dict(state_dict)
144
+
145
+ return model
146
+
147
+ def DPT_Swin_L_384(pretrained=True, **kwargs):
148
+ """ # This docstring shows up in hub.help()
149
+ MiDaS DPT_Swin_L_384 model for monocular depth estimation
150
+ pretrained (bool): load pretrained weights into model
151
+ """
152
+
153
+ model = DPTDepthModel(
154
+ path=None,
155
+ backbone="swinl12_384",
156
+ non_negative=True,
157
+ )
158
+
159
+ if pretrained:
160
+ checkpoint = (
161
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt"
162
+ )
163
+ state_dict = torch.hub.load_state_dict_from_url(
164
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
165
+ )
166
+ model.load_state_dict(state_dict)
167
+
168
+ return model
169
+
170
+ def DPT_Next_ViT_L_384(pretrained=True, **kwargs):
171
+ """ # This docstring shows up in hub.help()
172
+ MiDaS DPT_Next_ViT_L_384 model for monocular depth estimation
173
+ pretrained (bool): load pretrained weights into model
174
+ """
175
+
176
+ model = DPTDepthModel(
177
+ path=None,
178
+ backbone="next_vit_large_6m",
179
+ non_negative=True,
180
+ )
181
+
182
+ if pretrained:
183
+ checkpoint = (
184
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt"
185
+ )
186
+ state_dict = torch.hub.load_state_dict_from_url(
187
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
188
+ )
189
+ model.load_state_dict(state_dict)
190
+
191
+ return model
192
+
193
+ def DPT_LeViT_224(pretrained=True, **kwargs):
194
+ """ # This docstring shows up in hub.help()
195
+ MiDaS DPT_LeViT_224 model for monocular depth estimation
196
+ pretrained (bool): load pretrained weights into model
197
+ """
198
+
199
+ model = DPTDepthModel(
200
+ path=None,
201
+ backbone="levit_384",
202
+ non_negative=True,
203
+ head_features_1=64,
204
+ head_features_2=8,
205
+ )
206
+
207
+ if pretrained:
208
+ checkpoint = (
209
+ "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt"
210
+ )
211
+ state_dict = torch.hub.load_state_dict_from_url(
212
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
213
+ )
214
+ model.load_state_dict(state_dict)
215
+
216
+ return model
217
+
218
+ def DPT_Large(pretrained=True, **kwargs):
219
+ """ # This docstring shows up in hub.help()
220
+ MiDaS DPT-Large model for monocular depth estimation
221
+ pretrained (bool): load pretrained weights into model
222
+ """
223
+
224
+ model = DPTDepthModel(
225
+ path=None,
226
+ backbone="vitl16_384",
227
+ non_negative=True,
228
+ )
229
+
230
+ if pretrained:
231
+ checkpoint = (
232
+ "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt"
233
+ )
234
+ state_dict = torch.hub.load_state_dict_from_url(
235
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
236
+ )
237
+ model.load_state_dict(state_dict)
238
+
239
+ return model
240
+
241
+ def DPT_Hybrid(pretrained=True, **kwargs):
242
+ """ # This docstring shows up in hub.help()
243
+ MiDaS DPT-Hybrid model for monocular depth estimation
244
+ pretrained (bool): load pretrained weights into model
245
+ """
246
+
247
+ model = DPTDepthModel(
248
+ path=None,
249
+ backbone="vitb_rn50_384",
250
+ non_negative=True,
251
+ )
252
+
253
+ if pretrained:
254
+ checkpoint = (
255
+ "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt"
256
+ )
257
+ state_dict = torch.hub.load_state_dict_from_url(
258
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
259
+ )
260
+ model.load_state_dict(state_dict)
261
+
262
+ return model
263
+
264
+ def MiDaS(pretrained=True, **kwargs):
265
+ """ # This docstring shows up in hub.help()
266
+ MiDaS v2.1 model for monocular depth estimation
267
+ pretrained (bool): load pretrained weights into model
268
+ """
269
+
270
+ model = MidasNet()
271
+
272
+ if pretrained:
273
+ checkpoint = (
274
+ "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt"
275
+ )
276
+ state_dict = torch.hub.load_state_dict_from_url(
277
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
278
+ )
279
+ model.load_state_dict(state_dict)
280
+
281
+ return model
282
+
283
+ def MiDaS_small(pretrained=True, **kwargs):
284
+ """ # This docstring shows up in hub.help()
285
+ MiDaS v2.1 small model for monocular depth estimation on resource-constrained devices
286
+ pretrained (bool): load pretrained weights into model
287
+ """
288
+
289
+ model = MidasNet_small(None, features=64, backbone="efficientnet_lite3", exportable=True, non_negative=True, blocks={'expand': True})
290
+
291
+ if pretrained:
292
+ checkpoint = (
293
+ "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt"
294
+ )
295
+ state_dict = torch.hub.load_state_dict_from_url(
296
+ checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True
297
+ )
298
+ model.load_state_dict(state_dict)
299
+
300
+ return model
301
+
302
+
303
+ def transforms():
304
+ import cv2
305
+ from torchvision.transforms import Compose
306
+ from midas.transforms import Resize, NormalizeImage, PrepareForNet
307
+ from midas import transforms
308
+
309
+ transforms.default_transform = Compose(
310
+ [
311
+ lambda img: {"image": img / 255.0},
312
+ Resize(
313
+ 384,
314
+ 384,
315
+ resize_target=None,
316
+ keep_aspect_ratio=True,
317
+ ensure_multiple_of=32,
318
+ resize_method="upper_bound",
319
+ image_interpolation_method=cv2.INTER_CUBIC,
320
+ ),
321
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
322
+ PrepareForNet(),
323
+ lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
324
+ ]
325
+ )
326
+
327
+ transforms.small_transform = Compose(
328
+ [
329
+ lambda img: {"image": img / 255.0},
330
+ Resize(
331
+ 256,
332
+ 256,
333
+ resize_target=None,
334
+ keep_aspect_ratio=True,
335
+ ensure_multiple_of=32,
336
+ resize_method="upper_bound",
337
+ image_interpolation_method=cv2.INTER_CUBIC,
338
+ ),
339
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
340
+ PrepareForNet(),
341
+ lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
342
+ ]
343
+ )
344
+
345
+ transforms.dpt_transform = Compose(
346
+ [
347
+ lambda img: {"image": img / 255.0},
348
+ Resize(
349
+ 384,
350
+ 384,
351
+ resize_target=None,
352
+ keep_aspect_ratio=True,
353
+ ensure_multiple_of=32,
354
+ resize_method="minimal",
355
+ image_interpolation_method=cv2.INTER_CUBIC,
356
+ ),
357
+ NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
358
+ PrepareForNet(),
359
+ lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
360
+ ]
361
+ )
362
+
363
+ transforms.beit512_transform = Compose(
364
+ [
365
+ lambda img: {"image": img / 255.0},
366
+ Resize(
367
+ 512,
368
+ 512,
369
+ resize_target=None,
370
+ keep_aspect_ratio=True,
371
+ ensure_multiple_of=32,
372
+ resize_method="minimal",
373
+ image_interpolation_method=cv2.INTER_CUBIC,
374
+ ),
375
+ NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
376
+ PrepareForNet(),
377
+ lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
378
+ ]
379
+ )
380
+
381
+ transforms.swin384_transform = Compose(
382
+ [
383
+ lambda img: {"image": img / 255.0},
384
+ Resize(
385
+ 384,
386
+ 384,
387
+ resize_target=None,
388
+ keep_aspect_ratio=False,
389
+ ensure_multiple_of=32,
390
+ resize_method="minimal",
391
+ image_interpolation_method=cv2.INTER_CUBIC,
392
+ ),
393
+ NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
394
+ PrepareForNet(),
395
+ lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
396
+ ]
397
+ )
398
+
399
+ transforms.swin256_transform = Compose(
400
+ [
401
+ lambda img: {"image": img / 255.0},
402
+ Resize(
403
+ 256,
404
+ 256,
405
+ resize_target=None,
406
+ keep_aspect_ratio=False,
407
+ ensure_multiple_of=32,
408
+ resize_method="minimal",
409
+ image_interpolation_method=cv2.INTER_CUBIC,
410
+ ),
411
+ NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
412
+ PrepareForNet(),
413
+ lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
414
+ ]
415
+ )
416
+
417
+ transforms.levit_transform = Compose(
418
+ [
419
+ lambda img: {"image": img / 255.0},
420
+ Resize(
421
+ 224,
422
+ 224,
423
+ resize_target=None,
424
+ keep_aspect_ratio=False,
425
+ ensure_multiple_of=32,
426
+ resize_method="minimal",
427
+ image_interpolation_method=cv2.INTER_CUBIC,
428
+ ),
429
+ NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
430
+ PrepareForNet(),
431
+ lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
432
+ ]
433
+ )
434
+
435
+ return transforms
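For illustration, a hedged end-to-end sketch of how these entry points could be exercised locally; it assumes this file is importable as `hubconf`, that the `midas` package is on the path, and that an `input.jpg` exists, none of which is part of the file itself.

```python
import cv2
import torch
import hubconf  # this file

model = hubconf.MiDaS_small(pretrained=True).eval()
transform = hubconf.transforms().small_transform

img = cv2.cvtColor(cv2.imread("input.jpg"), cv2.COLOR_BGR2RGB)
with torch.no_grad():
    prediction = model(transform(img))  # inverse relative depth, shape 1 x H' x W'
    depth = torch.nn.functional.interpolate(
        prediction.unsqueeze(1), size=img.shape[:2], mode="bicubic", align_corners=False
    ).squeeze()
print(depth.shape)  # matches the input image resolution
```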
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/input/.placeholder ADDED
File without changes
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/beit.py ADDED
@@ -0,0 +1,196 @@
1
+ import timm
2
+ import torch
3
+ import types
4
+
5
+ import numpy as np
6
+ import torch.nn.functional as F
7
+
8
+ from .utils import forward_adapted_unflatten, make_backbone_default
9
+ from timm.models.beit import gen_relative_position_index
10
+ from torch.utils.checkpoint import checkpoint
11
+ from typing import Optional
12
+
13
+
14
+ def forward_beit(pretrained, x):
15
+ return forward_adapted_unflatten(pretrained, x, "forward_features")
16
+
17
+
18
+ def patch_embed_forward(self, x):
19
+ """
20
+ Modification of timm.models.layers.patch_embed.py: PatchEmbed.forward to support arbitrary window sizes.
21
+ """
22
+ x = self.proj(x)
23
+ if self.flatten:
24
+ x = x.flatten(2).transpose(1, 2)
25
+ x = self.norm(x)
26
+ return x
27
+
28
+
29
+ def _get_rel_pos_bias(self, window_size):
30
+ """
31
+ Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
32
+ """
33
+ old_height = 2 * self.window_size[0] - 1
34
+ old_width = 2 * self.window_size[1] - 1
35
+
36
+ new_height = 2 * window_size[0] - 1
37
+ new_width = 2 * window_size[1] - 1
38
+
39
+ old_relative_position_bias_table = self.relative_position_bias_table
40
+
41
+ old_num_relative_distance = self.num_relative_distance
42
+ new_num_relative_distance = new_height * new_width + 3
43
+
44
+ old_sub_table = old_relative_position_bias_table[:old_num_relative_distance - 3]
45
+
46
+ old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2)
47
+ new_sub_table = F.interpolate(old_sub_table, size=(new_height, new_width), mode="bilinear")
48
+ new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1)
49
+
50
+ new_relative_position_bias_table = torch.cat(
51
+ [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3:]])
52
+
53
+ key = str(window_size[1]) + "," + str(window_size[0])
54
+ if key not in self.relative_position_indices.keys():
55
+ self.relative_position_indices[key] = gen_relative_position_index(window_size)
56
+
57
+ relative_position_bias = new_relative_position_bias_table[
58
+ self.relative_position_indices[key].view(-1)].view(
59
+ window_size[0] * window_size[1] + 1,
60
+ window_size[0] * window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
61
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
62
+ return relative_position_bias.unsqueeze(0)
63
+
64
+
65
+ def attention_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None):
66
+ """
67
+ Modification of timm.models.beit.py: Attention.forward to support arbitrary window sizes.
68
+ """
69
+ B, N, C = x.shape
70
+
71
+ qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) if self.q_bias is not None else None
72
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
73
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
74
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
75
+
76
+ q = q * self.scale
77
+ attn = (q @ k.transpose(-2, -1))
78
+
79
+ if self.relative_position_bias_table is not None:
80
+ window_size = tuple(np.array(resolution) // 16)
81
+ attn = attn + self._get_rel_pos_bias(window_size)
82
+ if shared_rel_pos_bias is not None:
83
+ attn = attn + shared_rel_pos_bias
84
+
85
+ attn = attn.softmax(dim=-1)
86
+ attn = self.attn_drop(attn)
87
+
88
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
89
+ x = self.proj(x)
90
+ x = self.proj_drop(x)
91
+ return x
92
+
93
+
94
+ def block_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None):
95
+ """
96
+ Modification of timm.models.beit.py: Block.forward to support arbitrary window sizes.
97
+ """
98
+ if self.gamma_1 is None:
99
+ x = x + self.drop_path(self.attn(self.norm1(x), resolution, shared_rel_pos_bias=shared_rel_pos_bias))
100
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
101
+ else:
102
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), resolution,
103
+ shared_rel_pos_bias=shared_rel_pos_bias))
104
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
105
+ return x
106
+
107
+
108
+ def beit_forward_features(self, x):
109
+ """
110
+ Modification of timm.models.beit.py: Beit.forward_features to support arbitrary window sizes.
111
+ """
112
+ resolution = x.shape[2:]
113
+
114
+ x = self.patch_embed(x)
115
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
116
+ if self.pos_embed is not None:
117
+ x = x + self.pos_embed
118
+ x = self.pos_drop(x)
119
+
120
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
121
+ for blk in self.blocks:
122
+ if self.grad_checkpointing and not torch.jit.is_scripting():
123
+ x = checkpoint(blk, x, resolution, shared_rel_pos_bias=rel_pos_bias)
124
+ else:
125
+ x = blk(x, resolution, shared_rel_pos_bias=rel_pos_bias)
126
+ x = self.norm(x)
127
+ return x
128
+
129
+
130
+ def _make_beit_backbone(
131
+ model,
132
+ features=[96, 192, 384, 768],
133
+ size=[384, 384],
134
+ hooks=[0, 4, 8, 11],
135
+ vit_features=768,
136
+ use_readout="ignore",
137
+ start_index=1,
138
+ start_index_readout=1,
139
+ ):
140
+ backbone = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index,
141
+ start_index_readout)
142
+
143
+ backbone.model.patch_embed.forward = types.MethodType(patch_embed_forward, backbone.model.patch_embed)
144
+ backbone.model.forward_features = types.MethodType(beit_forward_features, backbone.model)
145
+
146
+ for block in backbone.model.blocks:
147
+ attn = block.attn
148
+ attn._get_rel_pos_bias = types.MethodType(_get_rel_pos_bias, attn)
149
+ attn.forward = types.MethodType(attention_forward, attn)
150
+ attn.relative_position_indices = {}
151
+
152
+ block.forward = types.MethodType(block_forward, block)
153
+
154
+ return backbone
155
+
156
+
157
+ def _make_pretrained_beitl16_512(pretrained, use_readout="ignore", hooks=None):
158
+ model = timm.create_model("beit_large_patch16_512", pretrained=pretrained)
159
+
160
+ hooks = [5, 11, 17, 23] if hooks is None else hooks
161
+
162
+ features = [256, 512, 1024, 1024]
163
+
164
+ return _make_beit_backbone(
165
+ model,
166
+ features=features,
167
+ size=[512, 512],
168
+ hooks=hooks,
169
+ vit_features=1024,
170
+ use_readout=use_readout,
171
+ )
172
+
173
+
174
+ def _make_pretrained_beitl16_384(pretrained, use_readout="ignore", hooks=None):
175
+ model = timm.create_model("beit_large_patch16_384", pretrained=pretrained)
176
+
177
+ hooks = [5, 11, 17, 23] if hooks is None else hooks
178
+ return _make_beit_backbone(
179
+ model,
180
+ features=[256, 512, 1024, 1024],
181
+ hooks=hooks,
182
+ vit_features=1024,
183
+ use_readout=use_readout,
184
+ )
185
+
186
+
187
+ def _make_pretrained_beitb16_384(pretrained, use_readout="ignore", hooks=None):
188
+ model = timm.create_model("beit_base_patch16_384", pretrained=pretrained)
189
+
190
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
191
+ return _make_beit_backbone(
192
+ model,
193
+ features=[96, 192, 384, 768],
194
+ hooks=hooks,
195
+ use_readout=use_readout,
196
+ )
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/levit.py ADDED
@@ -0,0 +1,106 @@
1
+ import timm
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+
6
+ from .utils import activations, get_activation, Transpose
7
+
8
+
9
+ def forward_levit(pretrained, x):
10
+ pretrained.model.forward_features(x)
11
+
12
+ layer_1 = pretrained.activations["1"]
13
+ layer_2 = pretrained.activations["2"]
14
+ layer_3 = pretrained.activations["3"]
15
+
16
+ layer_1 = pretrained.act_postprocess1(layer_1)
17
+ layer_2 = pretrained.act_postprocess2(layer_2)
18
+ layer_3 = pretrained.act_postprocess3(layer_3)
19
+
20
+ return layer_1, layer_2, layer_3
21
+
22
+
23
+ def _make_levit_backbone(
24
+ model,
25
+ hooks=[3, 11, 21],
26
+ patch_grid=[14, 14]
27
+ ):
28
+ pretrained = nn.Module()
29
+
30
+ pretrained.model = model
31
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
32
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
33
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
34
+
35
+ pretrained.activations = activations
36
+
37
+ patch_grid_size = np.array(patch_grid, dtype=int)
38
+
39
+ pretrained.act_postprocess1 = nn.Sequential(
40
+ Transpose(1, 2),
41
+ nn.Unflatten(2, torch.Size(patch_grid_size.tolist()))
42
+ )
43
+ pretrained.act_postprocess2 = nn.Sequential(
44
+ Transpose(1, 2),
45
+ nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 2).astype(int)).tolist()))
46
+ )
47
+ pretrained.act_postprocess3 = nn.Sequential(
48
+ Transpose(1, 2),
49
+ nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 4).astype(int)).tolist()))
50
+ )
51
+
52
+ return pretrained
53
+
54
+
55
+ class ConvTransposeNorm(nn.Sequential):
56
+ """
57
+ Modification of
58
+ https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: ConvNorm
59
+ such that ConvTranspose2d is used instead of Conv2d.
60
+ """
61
+
62
+ def __init__(
63
+ self, in_chs, out_chs, kernel_size=1, stride=1, pad=0, dilation=1,
64
+ groups=1, bn_weight_init=1):
65
+ super().__init__()
66
+ self.add_module('c',
67
+ nn.ConvTranspose2d(in_chs, out_chs, kernel_size, stride, pad, dilation, groups, bias=False))
68
+ self.add_module('bn', nn.BatchNorm2d(out_chs))
69
+
70
+ nn.init.constant_(self.bn.weight, bn_weight_init)
71
+
72
+ @torch.no_grad()
73
+ def fuse(self):
74
+ c, bn = self._modules.values()
75
+ w = bn.weight / (bn.running_var + bn.eps) ** 0.5
76
+ w = c.weight * w[:, None, None, None]
77
+ b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5
78
+ m = nn.ConvTranspose2d(
79
+ w.size(1), w.size(0), w.shape[2:], stride=self.c.stride,
80
+ padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups)
81
+ m.weight.data.copy_(w)
82
+ m.bias.data.copy_(b)
83
+ return m
84
+
85
+
86
+ def stem_b4_transpose(in_chs, out_chs, activation):
87
+ """
88
+ Modification of
89
+ https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: stem_b16
90
+ such that ConvTranspose2d is used instead of Conv2d and stem is also reduced to the half.
91
+ """
92
+ return nn.Sequential(
93
+ ConvTransposeNorm(in_chs, out_chs, 3, 2, 1),
94
+ activation(),
95
+ ConvTransposeNorm(out_chs, out_chs // 2, 3, 2, 1),
96
+ activation())
97
+
98
+
99
+ def _make_pretrained_levit_384(pretrained, hooks=None):
100
+ model = timm.create_model("levit_384", pretrained=pretrained)
101
+
102
+ hooks = [3, 11, 21] if hooks is None else hooks
103
+ return _make_levit_backbone(
104
+ model,
105
+ hooks=hooks
106
+ )
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/next_vit.py ADDED
@@ -0,0 +1,39 @@
1
+ import timm
2
+
3
+ import torch.nn as nn
4
+
5
+ from pathlib import Path
6
+ from .utils import activations, forward_default, get_activation
7
+
8
+ from ..external.next_vit.classification.nextvit import *
9
+
10
+
11
+ def forward_next_vit(pretrained, x):
12
+ return forward_default(pretrained, x, "forward")
13
+
14
+
15
+ def _make_next_vit_backbone(
16
+ model,
17
+ hooks=[2, 6, 36, 39],
18
+ ):
19
+ pretrained = nn.Module()
20
+
21
+ pretrained.model = model
22
+ pretrained.model.features[hooks[0]].register_forward_hook(get_activation("1"))
23
+ pretrained.model.features[hooks[1]].register_forward_hook(get_activation("2"))
24
+ pretrained.model.features[hooks[2]].register_forward_hook(get_activation("3"))
25
+ pretrained.model.features[hooks[3]].register_forward_hook(get_activation("4"))
26
+
27
+ pretrained.activations = activations
28
+
29
+ return pretrained
30
+
31
+
32
+ def _make_pretrained_next_vit_large_6m(hooks=None):
33
+ model = timm.create_model("nextvit_large")
34
+
35
+ hooks = [2, 6, 36, 39] if hooks is None else hooks
36
+ return _make_next_vit_backbone(
37
+ model,
38
+ hooks=hooks,
39
+ )
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin.py ADDED
@@ -0,0 +1,13 @@
1
+ import timm
2
+
3
+ from .swin_common import _make_swin_backbone
4
+
5
+
6
+ def _make_pretrained_swinl12_384(pretrained, hooks=None):
7
+ model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained)
8
+
9
+ hooks = [1, 1, 17, 1] if hooks is None else hooks
10
+ return _make_swin_backbone(
11
+ model,
12
+ hooks=hooks
13
+ )
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin2.py ADDED
@@ -0,0 +1,34 @@
1
+ import timm
2
+
3
+ from .swin_common import _make_swin_backbone
4
+
5
+
6
+ def _make_pretrained_swin2l24_384(pretrained, hooks=None):
7
+ model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained)
8
+
9
+ hooks = [1, 1, 17, 1] if hooks is None else hooks
10
+ return _make_swin_backbone(
11
+ model,
12
+ hooks=hooks
13
+ )
14
+
15
+
16
+ def _make_pretrained_swin2b24_384(pretrained, hooks=None):
17
+ model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained)
18
+
19
+ hooks = [1, 1, 17, 1] if hooks is None else hooks
20
+ return _make_swin_backbone(
21
+ model,
22
+ hooks=hooks
23
+ )
24
+
25
+
26
+ def _make_pretrained_swin2t16_256(pretrained, hooks=None):
27
+ model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained)
28
+
29
+ hooks = [1, 1, 5, 1] if hooks is None else hooks
30
+ return _make_swin_backbone(
31
+ model,
32
+ hooks=hooks,
33
+ patch_grid=[64, 64]
34
+ )