de0a7c5de67c68992c0d9cb5818a586853d0e4c757015f3aaf8dbb18421844e2
Browse files
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin_common.py +52 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/utils.py +249 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/vit.py +221 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/base_model.py +16 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/blocks.py +439 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/dpt_depth.py +166 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net.py +76 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net_custom.py +128 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/model_loader.py +242 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/transforms.py +234 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/output/.placeholder +0 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/LICENSE +21 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/README.md +131 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/do_catkin_make.sh +5 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/downloads.sh +5 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_melodic_ubuntu_17_18.sh +34 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_noetic_ubuntu_20.sh +33 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/make_package_cpp.sh +16 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/launch_midas_cpp.sh +2 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/CMakeLists.txt +189 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_cpp.launch +19 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_talker_listener.launch +23 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/package.xml +77 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener.py +61 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener_original.py +61 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/talker.py +53 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/src/main.cpp +285 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/run_talker_listener_test.sh +16 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/run.py +277 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/README.md +147 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/input/.placeholder +0 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/make_onnx_model.py +112 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/output/.placeholder +0 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_onnx.py +119 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_pb.py +135 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/transforms.py +234 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/utils.py +82 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/utils.py +199 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/weights/.placeholder +0 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/builder.py +51 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/depth_model.py +152 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/layers/attractor.py +208 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/layers/dist_layers.py +121 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/layers/localbins_layers.py +169 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/layers/patch_transformer.py +91 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/model_io.py +92 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth/__init__.py +31 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth.json +58 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json +22 -0
- extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth/zoedepth_v1.py +250 -0
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin_common.py
ADDED
@@ -0,0 +1,52 @@
import torch

import torch.nn as nn
import numpy as np

from .utils import activations, forward_default, get_activation, Transpose


def forward_swin(pretrained, x):
    return forward_default(pretrained, x)


def _make_swin_backbone(
        model,
        hooks=[1, 1, 17, 1],
        patch_grid=[96, 96]
):
    pretrained = nn.Module()

    pretrained.model = model
    pretrained.model.layers[0].blocks[hooks[0]].register_forward_hook(get_activation("1"))
    pretrained.model.layers[1].blocks[hooks[1]].register_forward_hook(get_activation("2"))
    pretrained.model.layers[2].blocks[hooks[2]].register_forward_hook(get_activation("3"))
    pretrained.model.layers[3].blocks[hooks[3]].register_forward_hook(get_activation("4"))

    pretrained.activations = activations

    if hasattr(model, "patch_grid"):
        used_patch_grid = model.patch_grid
    else:
        used_patch_grid = patch_grid

    patch_grid_size = np.array(used_patch_grid, dtype=int)

    pretrained.act_postprocess1 = nn.Sequential(
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size(patch_grid_size.tolist()))
    )
    pretrained.act_postprocess2 = nn.Sequential(
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size((patch_grid_size // 2).tolist()))
    )
    pretrained.act_postprocess3 = nn.Sequential(
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size((patch_grid_size // 4).tolist()))
    )
    pretrained.act_postprocess4 = nn.Sequential(
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size((patch_grid_size // 8).tolist()))
    )

    return pretrained
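A minimal shape-only sketch (not part of this commit) of what the act_postprocess stages attached by _make_swin_backbone do: each hooked Swin stage emits tokens of shape (batch, H*W, C), which are transposed and unflattened into feature maps at halving resolutions. The channel widths and the flat import path are illustrative assumptions (inside the package, Transpose lives in .utils).

import torch
import torch.nn as nn
import numpy as np

from utils import Transpose  # assumed flat import path; ".utils" inside the package

patch_grid = np.array([96, 96], dtype=int)  # e.g. a 384x384 input with a 4x4 patch embedding
for stage, div in enumerate([1, 2, 4, 8], start=1):
    grid = (patch_grid // div).tolist()
    tokens = torch.randn(1, grid[0] * grid[1], 192 * div)  # assumed Swin-L channel widths
    post = nn.Sequential(Transpose(1, 2), nn.Unflatten(2, torch.Size(grid)))
    print(f"stage {stage}:", post(tokens).shape)  # (batch, channels, H, W)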
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/utils.py
ADDED
@@ -0,0 +1,249 @@
import torch

import torch.nn as nn


class Slice(nn.Module):
    def __init__(self, start_index=1):
        super(Slice, self).__init__()
        self.start_index = start_index

    def forward(self, x):
        return x[:, self.start_index:]


class AddReadout(nn.Module):
    def __init__(self, start_index=1):
        super(AddReadout, self).__init__()
        self.start_index = start_index

    def forward(self, x):
        if self.start_index == 2:
            readout = (x[:, 0] + x[:, 1]) / 2
        else:
            readout = x[:, 0]
        return x[:, self.start_index:] + readout.unsqueeze(1)


class ProjectReadout(nn.Module):
    def __init__(self, in_features, start_index=1):
        super(ProjectReadout, self).__init__()
        self.start_index = start_index

        self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())

    def forward(self, x):
        readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:])
        features = torch.cat((x[:, self.start_index:], readout), -1)

        return self.project(features)


class Transpose(nn.Module):
    def __init__(self, dim0, dim1):
        super(Transpose, self).__init__()
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x):
        x = x.transpose(self.dim0, self.dim1)
        return x


activations = {}


def get_activation(name):
    def hook(model, input, output):
        activations[name] = output

    return hook


def forward_default(pretrained, x, function_name="forward_features"):
    exec(f"pretrained.model.{function_name}(x)")

    layer_1 = pretrained.activations["1"]
    layer_2 = pretrained.activations["2"]
    layer_3 = pretrained.activations["3"]
    layer_4 = pretrained.activations["4"]

    if hasattr(pretrained, "act_postprocess1"):
        layer_1 = pretrained.act_postprocess1(layer_1)
    if hasattr(pretrained, "act_postprocess2"):
        layer_2 = pretrained.act_postprocess2(layer_2)
    if hasattr(pretrained, "act_postprocess3"):
        layer_3 = pretrained.act_postprocess3(layer_3)
    if hasattr(pretrained, "act_postprocess4"):
        layer_4 = pretrained.act_postprocess4(layer_4)

    return layer_1, layer_2, layer_3, layer_4


def forward_adapted_unflatten(pretrained, x, function_name="forward_features"):
    b, c, h, w = x.shape

    exec(f"glob = pretrained.model.{function_name}(x)")

    layer_1 = pretrained.activations["1"]
    layer_2 = pretrained.activations["2"]
    layer_3 = pretrained.activations["3"]
    layer_4 = pretrained.activations["4"]

    layer_1 = pretrained.act_postprocess1[0:2](layer_1)
    layer_2 = pretrained.act_postprocess2[0:2](layer_2)
    layer_3 = pretrained.act_postprocess3[0:2](layer_3)
    layer_4 = pretrained.act_postprocess4[0:2](layer_4)

    unflatten = nn.Sequential(
        nn.Unflatten(
            2,
            torch.Size(
                [
                    h // pretrained.model.patch_size[1],
                    w // pretrained.model.patch_size[0],
                ]
            ),
        )
    )

    if layer_1.ndim == 3:
        layer_1 = unflatten(layer_1)
    if layer_2.ndim == 3:
        layer_2 = unflatten(layer_2)
    if layer_3.ndim == 3:
        layer_3 = unflatten(layer_3)
    if layer_4.ndim == 3:
        layer_4 = unflatten(layer_4)

    layer_1 = pretrained.act_postprocess1[3: len(pretrained.act_postprocess1)](layer_1)
    layer_2 = pretrained.act_postprocess2[3: len(pretrained.act_postprocess2)](layer_2)
    layer_3 = pretrained.act_postprocess3[3: len(pretrained.act_postprocess3)](layer_3)
    layer_4 = pretrained.act_postprocess4[3: len(pretrained.act_postprocess4)](layer_4)

    return layer_1, layer_2, layer_3, layer_4


def get_readout_oper(vit_features, features, use_readout, start_index=1):
    if use_readout == "ignore":
        readout_oper = [Slice(start_index)] * len(features)
    elif use_readout == "add":
        readout_oper = [AddReadout(start_index)] * len(features)
    elif use_readout == "project":
        readout_oper = [
            ProjectReadout(vit_features, start_index) for out_feat in features
        ]
    else:
        assert (
            False
        ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"

    return readout_oper


def make_backbone_default(
        model,
        features=[96, 192, 384, 768],
        size=[384, 384],
        hooks=[2, 5, 8, 11],
        vit_features=768,
        use_readout="ignore",
        start_index=1,
        start_index_readout=1,
):
    pretrained = nn.Module()

    pretrained.model = model
    pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
    pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
    pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
    pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))

    pretrained.activations = activations

    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index_readout)

    # 32, 48, 136, 384
    pretrained.act_postprocess1 = nn.Sequential(
        readout_oper[0],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[0],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.ConvTranspose2d(
            in_channels=features[0],
            out_channels=features[0],
            kernel_size=4,
            stride=4,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ),
    )

    pretrained.act_postprocess2 = nn.Sequential(
        readout_oper[1],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[1],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.ConvTranspose2d(
            in_channels=features[1],
            out_channels=features[1],
            kernel_size=2,
            stride=2,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ),
    )

    pretrained.act_postprocess3 = nn.Sequential(
        readout_oper[2],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[2],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )

    pretrained.act_postprocess4 = nn.Sequential(
        readout_oper[3],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[3],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )

    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]

    return pretrained
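A brief sketch (not part of this commit) of the three readout modes that get_readout_oper selects between; the shapes follow directly from the module definitions above, and the flat import path is an assumption (inside the package these live under .utils).

import torch

from utils import AddReadout, ProjectReadout, Slice, get_readout_oper  # assumed flat import path

tokens = torch.randn(2, 1 + 24 * 24, 768)  # (batch, class token + 24x24 patch tokens, width)

print(Slice(start_index=1)(tokens).shape)                            # class token dropped -> (2, 576, 768)
print(AddReadout(start_index=1)(tokens).shape)                       # class token added to every patch token
print(ProjectReadout(in_features=768, start_index=1)(tokens).shape)  # concatenated, then projected back to 768

# get_readout_oper builds one such module per hooked feature map:
ops = get_readout_oper(vit_features=768, features=[96, 192, 384, 768], use_readout="project")
print(len(ops))  # 4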
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/vit.py
ADDED
@@ -0,0 +1,221 @@
import torch
import torch.nn as nn
import timm
import types
import math
import torch.nn.functional as F

from .utils import (activations, forward_adapted_unflatten, get_activation, get_readout_oper,
                    make_backbone_default, Transpose)


def forward_vit(pretrained, x):
    return forward_adapted_unflatten(pretrained, x, "forward_flex")


def _resize_pos_embed(self, posemb, gs_h, gs_w):
    posemb_tok, posemb_grid = (
        posemb[:, : self.start_index],
        posemb[0, self.start_index:],
    )

    gs_old = int(math.sqrt(len(posemb_grid)))

    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
    posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)

    posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

    return posemb


def forward_flex(self, x):
    b, c, h, w = x.shape

    pos_embed = self._resize_pos_embed(
        self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
    )

    B = x.shape[0]

    if hasattr(self.patch_embed, "backbone"):
        x = self.patch_embed.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features

    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)

    if getattr(self, "dist_token", None) is not None:
        cls_tokens = self.cls_token.expand(
            B, -1, -1
        )  # stole cls_tokens impl from Phil Wang, thanks
        dist_token = self.dist_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, dist_token, x), dim=1)
    else:
        if self.no_embed_class:
            x = x + pos_embed
        cls_tokens = self.cls_token.expand(
            B, -1, -1
        )  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)

    if not self.no_embed_class:
        x = x + pos_embed
    x = self.pos_drop(x)

    for blk in self.blocks:
        x = blk(x)

    x = self.norm(x)

    return x


def _make_vit_b16_backbone(
        model,
        features=[96, 192, 384, 768],
        size=[384, 384],
        hooks=[2, 5, 8, 11],
        vit_features=768,
        use_readout="ignore",
        start_index=1,
        start_index_readout=1,
):
    pretrained = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index,
                                       start_index_readout)

    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model
    )

    return pretrained


def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
    model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)

    hooks = [5, 11, 17, 23] if hooks == None else hooks
    return _make_vit_b16_backbone(
        model,
        features=[256, 512, 1024, 1024],
        hooks=hooks,
        vit_features=1024,
        use_readout=use_readout,
    )


def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
    model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)

    hooks = [2, 5, 8, 11] if hooks == None else hooks
    return _make_vit_b16_backbone(
        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
    )


def _make_vit_b_rn50_backbone(
        model,
        features=[256, 512, 768, 768],
        size=[384, 384],
        hooks=[0, 1, 8, 11],
        vit_features=768,
        patch_size=[16, 16],
        number_stages=2,
        use_vit_only=False,
        use_readout="ignore",
        start_index=1,
):
    pretrained = nn.Module()

    pretrained.model = model

    used_number_stages = 0 if use_vit_only else number_stages
    for s in range(used_number_stages):
        pretrained.model.patch_embed.backbone.stages[s].register_forward_hook(
            get_activation(str(s + 1))
        )
    for s in range(used_number_stages, 4):
        pretrained.model.blocks[hooks[s]].register_forward_hook(get_activation(str(s + 1)))

    pretrained.activations = activations

    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)

    for s in range(used_number_stages):
        value = nn.Sequential(nn.Identity(), nn.Identity(), nn.Identity())
        exec(f"pretrained.act_postprocess{s + 1}=value")
    for s in range(used_number_stages, 4):
        if s < number_stages:
            final_layer = nn.ConvTranspose2d(
                in_channels=features[s],
                out_channels=features[s],
                kernel_size=4 // (2 ** s),
                stride=4 // (2 ** s),
                padding=0,
                bias=True,
                dilation=1,
                groups=1,
            )
        elif s > number_stages:
            final_layer = nn.Conv2d(
                in_channels=features[3],
                out_channels=features[3],
                kernel_size=3,
                stride=2,
                padding=1,
            )
        else:
            final_layer = None

        layers = [
            readout_oper[s],
            Transpose(1, 2),
            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[s],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
        ]
        if final_layer is not None:
            layers.append(final_layer)

        value = nn.Sequential(*layers)
        exec(f"pretrained.act_postprocess{s + 1}=value")

    pretrained.model.start_index = start_index
    pretrained.model.patch_size = patch_size

    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)

    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model
    )

    return pretrained


def _make_pretrained_vitb_rn50_384(
        pretrained, use_readout="ignore", hooks=None, use_vit_only=False
):
    model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)

    hooks = [0, 1, 8, 11] if hooks == None else hooks
    return _make_vit_b_rn50_backbone(
        model,
        features=[256, 512, 768, 768],
        size=[384, 384],
        hooks=hooks,
        use_vit_only=use_vit_only,
        use_readout=use_readout,
    )
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/base_model.py
ADDED
@@ -0,0 +1,16 @@
import torch


class BaseModel(torch.nn.Module):
    def load(self, path):
        """Load model from file.

        Args:
            path (str): file path
        """
        parameters = torch.load(path, map_location=torch.device('cpu'))

        if "optimizer" in parameters:
            parameters = parameters["model"]

        self.load_state_dict(parameters)
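A short sketch (not part of this commit) of the two checkpoint layouts BaseModel.load accepts: a bare state_dict, or a training checkpoint that stores the weights under "model" next to an "optimizer" entry. The subclass, file name, and flat import path are illustrative assumptions.

import torch

from base_model import BaseModel  # assumed flat import path

class TinyDepthNet(BaseModel):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 1, kernel_size=3, padding=1)

net = TinyDepthNet()
torch.save({"model": net.state_dict(), "optimizer": {}}, "checkpoint.pt")  # training-style checkpoint

restored = TinyDepthNet()
restored.load("checkpoint.pt")  # detects "optimizer", unwraps "model", then calls load_state_dict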
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/blocks.py
ADDED
@@ -0,0 +1,439 @@
import torch
import torch.nn as nn

from .backbones.beit import (
    _make_pretrained_beitl16_512,
    _make_pretrained_beitl16_384,
    _make_pretrained_beitb16_384,
    forward_beit,
)
from .backbones.swin_common import (
    forward_swin,
)
from .backbones.swin2 import (
    _make_pretrained_swin2l24_384,
    _make_pretrained_swin2b24_384,
    _make_pretrained_swin2t16_256,
)
from .backbones.swin import (
    _make_pretrained_swinl12_384,
)
from .backbones.levit import (
    _make_pretrained_levit_384,
    forward_levit,
)
from .backbones.vit import (
    _make_pretrained_vitb_rn50_384,
    _make_pretrained_vitl16_384,
    _make_pretrained_vitb16_384,
    forward_vit,
)


def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None,
                  use_vit_only=False, use_readout="ignore", in_features=[96, 256, 512, 1024]):
    if backbone == "beitl16_512":
        pretrained = _make_pretrained_beitl16_512(
            use_pretrained, hooks=hooks, use_readout=use_readout
        )
        scratch = _make_scratch(
            [256, 512, 1024, 1024], features, groups=groups, expand=expand
        )  # BEiT_512-L (backbone)
    elif backbone == "beitl16_384":
        pretrained = _make_pretrained_beitl16_384(
            use_pretrained, hooks=hooks, use_readout=use_readout
        )
        scratch = _make_scratch(
            [256, 512, 1024, 1024], features, groups=groups, expand=expand
        )  # BEiT_384-L (backbone)
    elif backbone == "beitb16_384":
        pretrained = _make_pretrained_beitb16_384(
            use_pretrained, hooks=hooks, use_readout=use_readout
        )
        scratch = _make_scratch(
            [96, 192, 384, 768], features, groups=groups, expand=expand
        )  # BEiT_384-B (backbone)
    elif backbone == "swin2l24_384":
        pretrained = _make_pretrained_swin2l24_384(
            use_pretrained, hooks=hooks
        )
        scratch = _make_scratch(
            [192, 384, 768, 1536], features, groups=groups, expand=expand
        )  # Swin2-L/12to24 (backbone)
    elif backbone == "swin2b24_384":
        pretrained = _make_pretrained_swin2b24_384(
            use_pretrained, hooks=hooks
        )
        scratch = _make_scratch(
            [128, 256, 512, 1024], features, groups=groups, expand=expand
        )  # Swin2-B/12to24 (backbone)
    elif backbone == "swin2t16_256":
        pretrained = _make_pretrained_swin2t16_256(
            use_pretrained, hooks=hooks
        )
        scratch = _make_scratch(
            [96, 192, 384, 768], features, groups=groups, expand=expand
        )  # Swin2-T/16 (backbone)
    elif backbone == "swinl12_384":
        pretrained = _make_pretrained_swinl12_384(
            use_pretrained, hooks=hooks
        )
        scratch = _make_scratch(
            [192, 384, 768, 1536], features, groups=groups, expand=expand
        )  # Swin-L/12 (backbone)
    elif backbone == "next_vit_large_6m":
        from .backbones.next_vit import _make_pretrained_next_vit_large_6m
        pretrained = _make_pretrained_next_vit_large_6m(hooks=hooks)
        scratch = _make_scratch(
            in_features, features, groups=groups, expand=expand
        )  # Next-ViT-L on ImageNet-1K-6M (backbone)
    elif backbone == "levit_384":
        pretrained = _make_pretrained_levit_384(
            use_pretrained, hooks=hooks
        )
        scratch = _make_scratch(
            [384, 512, 768], features, groups=groups, expand=expand
        )  # LeViT 384 (backbone)
    elif backbone == "vitl16_384":
        pretrained = _make_pretrained_vitl16_384(
            use_pretrained, hooks=hooks, use_readout=use_readout
        )
        scratch = _make_scratch(
            [256, 512, 1024, 1024], features, groups=groups, expand=expand
        )  # ViT-L/16 - 85.0% Top1 (backbone)
    elif backbone == "vitb_rn50_384":
        pretrained = _make_pretrained_vitb_rn50_384(
            use_pretrained,
            hooks=hooks,
            use_vit_only=use_vit_only,
            use_readout=use_readout,
        )
        scratch = _make_scratch(
            [256, 512, 768, 768], features, groups=groups, expand=expand
        )  # ViT-H/16 - 85.0% Top1 (backbone)
    elif backbone == "vitb16_384":
        pretrained = _make_pretrained_vitb16_384(
            use_pretrained, hooks=hooks, use_readout=use_readout
        )
        scratch = _make_scratch(
            [96, 192, 384, 768], features, groups=groups, expand=expand
        )  # ViT-B/16 - 84.6% Top1 (backbone)
    elif backbone == "resnext101_wsl":
        pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
        scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand)  # efficientnet_lite3
    elif backbone == "efficientnet_lite3":
        pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
        scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand)  # efficientnet_lite3
    else:
        print(f"Backbone '{backbone}' not implemented")
        assert False

    return pretrained, scratch


def _make_scratch(in_shape, out_shape, groups=1, expand=False):
    scratch = nn.Module()

    out_shape1 = out_shape
    out_shape2 = out_shape
    out_shape3 = out_shape
    if len(in_shape) >= 4:
        out_shape4 = out_shape

    if expand:
        out_shape1 = out_shape
        out_shape2 = out_shape*2
        out_shape3 = out_shape*4
        if len(in_shape) >= 4:
            out_shape4 = out_shape*8

    scratch.layer1_rn = nn.Conv2d(
        in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
    )
    scratch.layer2_rn = nn.Conv2d(
        in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
    )
    scratch.layer3_rn = nn.Conv2d(
        in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
    )
    if len(in_shape) >= 4:
        scratch.layer4_rn = nn.Conv2d(
            in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
        )

    return scratch


def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
    efficientnet = torch.hub.load(
        "rwightman/gen-efficientnet-pytorch",
        "tf_efficientnet_lite3",
        pretrained=use_pretrained,
        exportable=exportable
    )
    return _make_efficientnet_backbone(efficientnet)


def _make_efficientnet_backbone(effnet):
    pretrained = nn.Module()

    pretrained.layer1 = nn.Sequential(
        effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
    )
    pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
    pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
    pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])

    return pretrained


def _make_resnet_backbone(resnet):
    pretrained = nn.Module()
    pretrained.layer1 = nn.Sequential(
        resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
    )

    pretrained.layer2 = resnet.layer2
    pretrained.layer3 = resnet.layer3
    pretrained.layer4 = resnet.layer4

    return pretrained


def _make_pretrained_resnext101_wsl(use_pretrained):
    resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
    return _make_resnet_backbone(resnet)


class Interpolate(nn.Module):
    """Interpolation module.
    """

    def __init__(self, scale_factor, mode, align_corners=False):
        """Init.

        Args:
            scale_factor (float): scaling
            mode (str): interpolation mode
        """
        super(Interpolate, self).__init__()

        self.interp = nn.functional.interpolate
        self.scale_factor = scale_factor
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input

        Returns:
            tensor: interpolated data
        """

        x = self.interp(
            x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
        )

        return x


class ResidualConvUnit(nn.Module):
    """Residual convolution module.
    """

    def __init__(self, features):
        """Init.

        Args:
            features (int): number of features
        """
        super().__init__()

        self.conv1 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True
        )

        self.conv2 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True
        )

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input

        Returns:
            tensor: output
        """
        out = self.relu(x)
        out = self.conv1(out)
        out = self.relu(out)
        out = self.conv2(out)

        return out + x


class FeatureFusionBlock(nn.Module):
    """Feature fusion block.
    """

    def __init__(self, features):
        """Init.

        Args:
            features (int): number of features
        """
        super(FeatureFusionBlock, self).__init__()

        self.resConfUnit1 = ResidualConvUnit(features)
        self.resConfUnit2 = ResidualConvUnit(features)

    def forward(self, *xs):
        """Forward pass.

        Returns:
            tensor: output
        """
        output = xs[0]

        if len(xs) == 2:
            output += self.resConfUnit1(xs[1])

        output = self.resConfUnit2(output)

        output = nn.functional.interpolate(
            output, scale_factor=2, mode="bilinear", align_corners=True
        )

        return output


class ResidualConvUnit_custom(nn.Module):
    """Residual convolution module.
    """

    def __init__(self, features, activation, bn):
        """Init.

        Args:
            features (int): number of features
        """
        super().__init__()

        self.bn = bn

        self.groups=1

        self.conv1 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
        )

        self.conv2 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
        )

        if self.bn==True:
            self.bn1 = nn.BatchNorm2d(features)
            self.bn2 = nn.BatchNorm2d(features)

        self.activation = activation

        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input

        Returns:
            tensor: output
        """

        out = self.activation(x)
        out = self.conv1(out)
        if self.bn==True:
            out = self.bn1(out)

        out = self.activation(out)
        out = self.conv2(out)
        if self.bn==True:
            out = self.bn2(out)

        if self.groups > 1:
            out = self.conv_merge(out)

        return self.skip_add.add(out, x)

        # return out + x


class FeatureFusionBlock_custom(nn.Module):
    """Feature fusion block.
    """

    def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None):
        """Init.

        Args:
            features (int): number of features
        """
        super(FeatureFusionBlock_custom, self).__init__()

        self.deconv = deconv
        self.align_corners = align_corners

        self.groups=1

        self.expand = expand
        out_features = features
        if self.expand==True:
            out_features = features//2

        self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)

        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)

        self.skip_add = nn.quantized.FloatFunctional()

        self.size=size

    def forward(self, *xs, size=None):
        """Forward pass.

        Returns:
            tensor: output
        """
        output = xs[0]

        if len(xs) == 2:
            res = self.resConfUnit1(xs[1])
            output = self.skip_add.add(output, res)
            # output += res

        output = self.resConfUnit2(output)

        if (size is None) and (self.size is None):
            modifier = {"scale_factor": 2}
        elif size is None:
            modifier = {"size": self.size}
        else:
            modifier = {"size": size}

        output = nn.functional.interpolate(
            output, **modifier, mode="bilinear", align_corners=self.align_corners
        )

        output = self.out_conv(output)

        return output
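A shape-only sketch (not part of this commit) of how the scratch convolutions from _make_scratch and a chain of FeatureFusionBlock_custom modules refine four encoder feature maps, mirroring the refinenet path used elsewhere in this commit; channel counts, strides, and the flat import path are assumptions for illustration.

import torch
import torch.nn as nn

from blocks import FeatureFusionBlock_custom, _make_scratch  # assumed flat import path

# Four encoder feature maps at strides 4/8/16/32 for an assumed 384x384 input.
feats = [torch.randn(1, c, s, s) for c, s in zip([96, 192, 384, 768], [96, 48, 24, 12])]

scratch = _make_scratch([96, 192, 384, 768], 256)
l1, l2, l3, l4 = [rn(f) for rn, f in zip(
    [scratch.layer1_rn, scratch.layer2_rn, scratch.layer3_rn, scratch.layer4_rn], feats)]

refinenet4, refinenet3, refinenet2, refinenet1 = [
    FeatureFusionBlock_custom(256, nn.ReLU(False), bn=False, align_corners=True) for _ in range(4)
]

path_4 = refinenet4(l4, size=l3.shape[2:])
path_3 = refinenet3(path_4, l3, size=l2.shape[2:])
path_2 = refinenet2(path_3, l2, size=l1.shape[2:])
path_1 = refinenet1(path_2, l1)  # no size given -> upsample by a factor of 2

print(path_1.shape)  # torch.Size([1, 256, 192, 192]), half the input resolution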
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/dpt_depth.py
ADDED
@@ -0,0 +1,166 @@
import torch
import torch.nn as nn

from .base_model import BaseModel
from .blocks import (
    FeatureFusionBlock_custom,
    Interpolate,
    _make_encoder,
    forward_beit,
    forward_swin,
    forward_levit,
    forward_vit,
)
from .backbones.levit import stem_b4_transpose
from timm.models.layers import get_act_layer


def _make_fusion_block(features, use_bn, size = None):
    return FeatureFusionBlock_custom(
        features,
        nn.ReLU(False),
        deconv=False,
        bn=use_bn,
        expand=False,
        align_corners=True,
        size=size,
    )


class DPT(BaseModel):
    def __init__(
            self,
            head,
            features=256,
            backbone="vitb_rn50_384",
            readout="project",
            channels_last=False,
            use_bn=False,
            **kwargs
    ):

        super(DPT, self).__init__()

        self.channels_last = channels_last

        # For the Swin, Swin 2, LeViT and Next-ViT Transformers, the hierarchical architectures prevent setting the
        # hooks freely. Instead, the hooks have to be chosen according to the ranges specified in the comments.
        hooks = {
            "beitl16_512": [5, 11, 17, 23],
            "beitl16_384": [5, 11, 17, 23],
            "beitb16_384": [2, 5, 8, 11],
            "swin2l24_384": [1, 1, 17, 1],  # Allowed ranges: [0, 1], [0, 1], [ 0, 17], [ 0, 1]
            "swin2b24_384": [1, 1, 17, 1],  # [0, 1], [0, 1], [ 0, 17], [ 0, 1]
            "swin2t16_256": [1, 1, 5, 1],  # [0, 1], [0, 1], [ 0, 5], [ 0, 1]
            "swinl12_384": [1, 1, 17, 1],  # [0, 1], [0, 1], [ 0, 17], [ 0, 1]
            "next_vit_large_6m": [2, 6, 36, 39],  # [0, 2], [3, 6], [ 7, 36], [37, 39]
            "levit_384": [3, 11, 21],  # [0, 3], [6, 11], [14, 21]
            "vitb_rn50_384": [0, 1, 8, 11],
            "vitb16_384": [2, 5, 8, 11],
            "vitl16_384": [5, 11, 17, 23],
        }[backbone]

        if "next_vit" in backbone:
            in_features = {
                "next_vit_large_6m": [96, 256, 512, 1024],
            }[backbone]
        else:
            in_features = None

        # Instantiate backbone and reassemble blocks
        self.pretrained, self.scratch = _make_encoder(
            backbone,
            features,
            False,  # Set to true if you want to train from scratch, uses ImageNet weights
            groups=1,
            expand=False,
            exportable=False,
            hooks=hooks,
            use_readout=readout,
            in_features=in_features,
        )

        self.number_layers = len(hooks) if hooks is not None else 4
        size_refinenet3 = None
        self.scratch.stem_transpose = None

        if "beit" in backbone:
            self.forward_transformer = forward_beit
        elif "swin" in backbone:
            self.forward_transformer = forward_swin
        elif "next_vit" in backbone:
            from .backbones.next_vit import forward_next_vit
            self.forward_transformer = forward_next_vit
        elif "levit" in backbone:
            self.forward_transformer = forward_levit
            size_refinenet3 = 7
            self.scratch.stem_transpose = stem_b4_transpose(256, 128, get_act_layer("hard_swish"))
        else:
            self.forward_transformer = forward_vit

        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet3 = _make_fusion_block(features, use_bn, size_refinenet3)
        if self.number_layers >= 4:
            self.scratch.refinenet4 = _make_fusion_block(features, use_bn)

        self.scratch.output_conv = head

    def forward(self, x):
        if self.channels_last == True:
            x.contiguous(memory_format=torch.channels_last)

        layers = self.forward_transformer(self.pretrained, x)
        if self.number_layers == 3:
            layer_1, layer_2, layer_3 = layers
        else:
            layer_1, layer_2, layer_3, layer_4 = layers

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        if self.number_layers >= 4:
            layer_4_rn = self.scratch.layer4_rn(layer_4)

        if self.number_layers == 3:
            path_3 = self.scratch.refinenet3(layer_3_rn, size=layer_2_rn.shape[2:])
        else:
            path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
            path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        if self.scratch.stem_transpose is not None:
            path_1 = self.scratch.stem_transpose(path_1)

        out = self.scratch.output_conv(path_1)

        return out


class DPTDepthModel(DPT):
    def __init__(self, path=None, non_negative=True, **kwargs):
        features = kwargs["features"] if "features" in kwargs else 256
        head_features_1 = kwargs["head_features_1"] if "head_features_1" in kwargs else features
        head_features_2 = kwargs["head_features_2"] if "head_features_2" in kwargs else 32
        kwargs.pop("head_features_1", None)
        kwargs.pop("head_features_2", None)

        head = nn.Sequential(
            nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
            nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
            nn.Identity(),
        )

        super().__init__(head, **kwargs)

        if path is not None:
            self.load(path)

    def forward(self, x):
        return super().forward(x).squeeze(dim=1)
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net.py
ADDED
@@ -0,0 +1,76 @@
"""MidasNet: Network for monocular depth estimation trained by mixing several datasets.
This file contains code that is adapted from
https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
"""
import torch
import torch.nn as nn

from .base_model import BaseModel
from .blocks import FeatureFusionBlock, Interpolate, _make_encoder


class MidasNet(BaseModel):
    """Network for monocular depth estimation.
    """

    def __init__(self, path=None, features=256, non_negative=True):
        """Init.

        Args:
            path (str, optional): Path to saved model. Defaults to None.
            features (int, optional): Number of features. Defaults to 256.
            backbone (str, optional): Backbone network for encoder. Defaults to resnet50
        """
        print("Loading weights: ", path)

        super(MidasNet, self).__init__()

        use_pretrained = False if path is None else True

        self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)

        self.scratch.refinenet4 = FeatureFusionBlock(features)
        self.scratch.refinenet3 = FeatureFusionBlock(features)
        self.scratch.refinenet2 = FeatureFusionBlock(features)
        self.scratch.refinenet1 = FeatureFusionBlock(features)

        self.scratch.output_conv = nn.Sequential(
            nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode="bilinear"),
            nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
        )

        if path:
            self.load(path)

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input data (image)

        Returns:
            tensor: depth
        """

        layer_1 = self.pretrained.layer1(x)
        layer_2 = self.pretrained.layer2(layer_1)
        layer_3 = self.pretrained.layer3(layer_2)
        layer_4 = self.pretrained.layer4(layer_3)

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv(path_1)

        return torch.squeeze(out, dim=1)
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net_custom.py
ADDED
@@ -0,0 +1,128 @@
"""MidasNet: Network for monocular depth estimation trained by mixing several datasets.
This file contains code that is adapted from
https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
"""
import torch
import torch.nn as nn

from .base_model import BaseModel
from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder


class MidasNet_small(BaseModel):
    """Network for monocular depth estimation.
    """

    def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
                 blocks={'expand': True}):
        """Init.

        Args:
            path (str, optional): Path to saved model. Defaults to None.
            features (int, optional): Number of features. Defaults to 64.
            backbone (str, optional): Backbone network for encoder. Defaults to efficientnet_lite3.
        """
        print("Loading weights: ", path)

        super(MidasNet_small, self).__init__()

        use_pretrained = False if path else True

        self.channels_last = channels_last
        self.blocks = blocks
        self.backbone = backbone

        self.groups = 1

        features1 = features
        features2 = features
        features3 = features
        features4 = features
        self.expand = False
        if "expand" in self.blocks and self.blocks['expand'] == True:
            self.expand = True
            features1 = features
            features2 = features * 2
            features3 = features * 4
            features4 = features * 8

        self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)

        self.scratch.activation = nn.ReLU(False)

        self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
        self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)

        self.scratch.output_conv = nn.Sequential(
            nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1, groups=self.groups),
            Interpolate(scale_factor=2, mode="bilinear"),
            nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
            self.scratch.activation,
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
            nn.Identity(),
        )

        if path:
            self.load(path)

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input data (image)

        Returns:
            tensor: depth
        """
        if self.channels_last == True:
            print("self.channels_last = ", self.channels_last)
            x = x.contiguous(memory_format=torch.channels_last)

        layer_1 = self.pretrained.layer1(x)
        layer_2 = self.pretrained.layer2(layer_1)
        layer_3 = self.pretrained.layer3(layer_2)
        layer_4 = self.pretrained.layer4(layer_3)

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv(path_1)

        return torch.squeeze(out, dim=1)


def fuse_model(m):
    prev_previous_type = nn.Identity()
    prev_previous_name = ''
    previous_type = nn.Identity()
    previous_name = ''
    for name, module in m.named_modules():
        if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU:
            # print("FUSED ", prev_previous_name, previous_name, name)
            torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True)
        elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
            # print("FUSED ", prev_previous_name, previous_name)
            torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True)
        # elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
        #     print("FUSED ", previous_name, name)
        #     torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)

        prev_previous_type = previous_type
        prev_previous_name = previous_name
        previous_type = type(module)
        previous_name = name
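A minimal usage sketch of `MidasNet_small` (not part of the commit; shown only for orientation). It assumes it is run from a checkout where the `midas` package and its timm backbone are importable, and that no pretrained MiDaS weights are loaded (`path=None`):

```python
# Hypothetical usage sketch, not part of the commit.
# Assumes the midas package is importable and timm can provide the
# efficientnet_lite3 backbone weights.
import torch
from midas.midas_net_custom import MidasNet_small, fuse_model

model = MidasNet_small(path=None, features=64, backbone="efficientnet_lite3")
model.eval()

# Dummy 256x256 RGB batch; real inputs come from the Resize/NormalizeImage/
# PrepareForNet pipeline defined in transforms.py below.
x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    depth = model(x)  # inverse relative depth, shape (1, H, W)
print(depth.shape)

# Optional: fuse Conv2d+BatchNorm2d(+ReLU) groups in place before quantization.
# fuse_model(model)
```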
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/model_loader.py
ADDED
@@ -0,0 +1,242 @@
import cv2
import torch

from midas.dpt_depth import DPTDepthModel
from midas.midas_net import MidasNet
from midas.midas_net_custom import MidasNet_small
from midas.transforms import Resize, NormalizeImage, PrepareForNet

from torchvision.transforms import Compose

default_models = {
    "dpt_beit_large_512": "weights/dpt_beit_large_512.pt",
    "dpt_beit_large_384": "weights/dpt_beit_large_384.pt",
    "dpt_beit_base_384": "weights/dpt_beit_base_384.pt",
    "dpt_swin2_large_384": "weights/dpt_swin2_large_384.pt",
    "dpt_swin2_base_384": "weights/dpt_swin2_base_384.pt",
    "dpt_swin2_tiny_256": "weights/dpt_swin2_tiny_256.pt",
    "dpt_swin_large_384": "weights/dpt_swin_large_384.pt",
    "dpt_next_vit_large_384": "weights/dpt_next_vit_large_384.pt",
    "dpt_levit_224": "weights/dpt_levit_224.pt",
    "dpt_large_384": "weights/dpt_large_384.pt",
    "dpt_hybrid_384": "weights/dpt_hybrid_384.pt",
    "midas_v21_384": "weights/midas_v21_384.pt",
    "midas_v21_small_256": "weights/midas_v21_small_256.pt",
    "openvino_midas_v21_small_256": "weights/openvino_midas_v21_small_256.xml",
}


def load_model(device, model_path, model_type="dpt_large_384", optimize=True, height=None, square=False):
    """Load the specified network.

    Args:
        device (device): the torch device used
        model_path (str): path to saved model
        model_type (str): the type of the model to be loaded
        optimize (bool): optimize the model to half-floats on CUDA?
        height (int): inference encoder image height
        square (bool): resize to a square resolution?

    Returns:
        The loaded network, the transform which prepares images as input to the network and the dimensions of the
        network input
    """
    if "openvino" in model_type:
        from openvino.runtime import Core

    keep_aspect_ratio = not square

    if model_type == "dpt_beit_large_512":
        model = DPTDepthModel(
            path=model_path,
            backbone="beitl16_512",
            non_negative=True,
        )
        net_w, net_h = 512, 512
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_beit_large_384":
        model = DPTDepthModel(
            path=model_path,
            backbone="beitl16_384",
            non_negative=True,
        )
        net_w, net_h = 384, 384
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_beit_base_384":
        model = DPTDepthModel(
            path=model_path,
            backbone="beitb16_384",
            non_negative=True,
        )
        net_w, net_h = 384, 384
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_swin2_large_384":
        model = DPTDepthModel(
            path=model_path,
            backbone="swin2l24_384",
            non_negative=True,
        )
        net_w, net_h = 384, 384
        keep_aspect_ratio = False
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_swin2_base_384":
        model = DPTDepthModel(
            path=model_path,
            backbone="swin2b24_384",
            non_negative=True,
        )
        net_w, net_h = 384, 384
        keep_aspect_ratio = False
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_swin2_tiny_256":
        model = DPTDepthModel(
            path=model_path,
            backbone="swin2t16_256",
            non_negative=True,
        )
        net_w, net_h = 256, 256
        keep_aspect_ratio = False
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_swin_large_384":
        model = DPTDepthModel(
            path=model_path,
            backbone="swinl12_384",
            non_negative=True,
        )
        net_w, net_h = 384, 384
        keep_aspect_ratio = False
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_next_vit_large_384":
        model = DPTDepthModel(
            path=model_path,
            backbone="next_vit_large_6m",
            non_negative=True,
        )
        net_w, net_h = 384, 384
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    # We change the notation from dpt_levit_224 (MiDaS notation) to levit_384 (timm notation) here, where the 224 refers
    # to the resolution 224x224 used by LeViT and 384 is the first entry of the embed_dim, see _cfg and model_cfgs of
    # https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/levit.py
    # (commit id: 927f031293a30afb940fff0bee34b85d9c059b0e)
    elif model_type == "dpt_levit_224":
        model = DPTDepthModel(
            path=model_path,
            backbone="levit_384",
            non_negative=True,
            head_features_1=64,
            head_features_2=8,
        )
        net_w, net_h = 224, 224
        keep_aspect_ratio = False
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_large_384":
        model = DPTDepthModel(
            path=model_path,
            backbone="vitl16_384",
            non_negative=True,
        )
        net_w, net_h = 384, 384
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "dpt_hybrid_384":
        model = DPTDepthModel(
            path=model_path,
            backbone="vitb_rn50_384",
            non_negative=True,
        )
        net_w, net_h = 384, 384
        resize_mode = "minimal"
        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    elif model_type == "midas_v21_384":
        model = MidasNet(model_path, non_negative=True)
        net_w, net_h = 384, 384
        resize_mode = "upper_bound"
        normalization = NormalizeImage(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )

    elif model_type == "midas_v21_small_256":
        model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
                               non_negative=True, blocks={'expand': True})
        net_w, net_h = 256, 256
        resize_mode = "upper_bound"
        normalization = NormalizeImage(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )

    elif model_type == "openvino_midas_v21_small_256":
        ie = Core()
        uncompiled_model = ie.read_model(model=model_path)
        model = ie.compile_model(uncompiled_model, "CPU")
        net_w, net_h = 256, 256
        resize_mode = "upper_bound"
        normalization = NormalizeImage(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )

    else:
        print(f"model_type '{model_type}' not implemented, use: --model_type large")
        assert False

    if not "openvino" in model_type:
        print("Model loaded, number of parameters = {:.0f}M".format(sum(p.numel() for p in model.parameters()) / 1e6))
    else:
        print("Model loaded, optimized with OpenVINO")

    if "openvino" in model_type:
        keep_aspect_ratio = False

    if height is not None:
        net_w, net_h = height, height

    transform = Compose(
        [
            Resize(
                net_w,
                net_h,
                resize_target=None,
                keep_aspect_ratio=keep_aspect_ratio,
                ensure_multiple_of=32,
                resize_method=resize_mode,
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            normalization,
            PrepareForNet(),
        ]
    )

    if not "openvino" in model_type:
        model.eval()

    if optimize and (device == torch.device("cuda")):
        if not "openvino" in model_type:
            model = model.to(memory_format=torch.channels_last)
            model = model.half()
        else:
            print("Error: OpenVINO models are already optimized. No optimization to half-float possible.")
            exit()

    if not "openvino" in model_type:
        model.to(device)

    return model, transform, net_w, net_h
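A minimal sketch of how `load_model` and the returned transform fit together (not part of the commit). It assumes the weights listed in `default_models` have been downloaded and that a file `input.jpg` exists; both are illustrative assumptions:

```python
# Hypothetical usage sketch, not part of the commit.
import cv2
import torch
from midas.model_loader import load_model, default_models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_type = "midas_v21_small_256"
model, transform, net_w, net_h = load_model(
    device, default_models[model_type], model_type, optimize=False
)

# Read BGR uint8, convert to RGB float in [0, 1] as the transform expects.
img = cv2.cvtColor(cv2.imread("input.jpg"), cv2.COLOR_BGR2RGB) / 255.0
sample = transform({"image": img})["image"]  # CHW float32, resized for the net

with torch.no_grad():
    prediction = model(torch.from_numpy(sample).to(device).unsqueeze(0))
    # Resize the inverse relative depth back to the original image resolution.
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1), size=img.shape[:2], mode="bicubic", align_corners=False
    ).squeeze().cpu().numpy()
print(prediction.shape)
```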
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/transforms.py
ADDED
@@ -0,0 +1,234 @@
import numpy as np
import cv2
import math


def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
    """Resize the sample to ensure the given size. Keeps aspect ratio.

    Args:
        sample (dict): sample
        size (tuple): image size

    Returns:
        tuple: new size
    """
    shape = list(sample["disparity"].shape)

    if shape[0] >= size[0] and shape[1] >= size[1]:
        return sample

    scale = [0, 0]
    scale[0] = size[0] / shape[0]
    scale[1] = size[1] / shape[1]

    scale = max(scale)

    shape[0] = math.ceil(scale * shape[0])
    shape[1] = math.ceil(scale * shape[1])

    # resize
    sample["image"] = cv2.resize(
        sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
    )

    sample["disparity"] = cv2.resize(
        sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
    )
    sample["mask"] = cv2.resize(
        sample["mask"].astype(np.float32),
        tuple(shape[::-1]),
        interpolation=cv2.INTER_NEAREST,
    )
    sample["mask"] = sample["mask"].astype(bool)

    return tuple(shape)


class Resize(object):
    """Resize sample to given size (width, height).
    """

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as little as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                if scale_width > scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                if scale_width < scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # scale as little as possible
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            else:
                raise ValueError(
                    f"resize_method {self.__resize_method} not implemented"
                )

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, min_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, min_val=self.__width
            )
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, max_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, max_val=self.__width
            )
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        width, height = self.get_size(
            sample["image"].shape[1], sample["image"].shape[0]
        )

        # resize sample
        sample["image"] = cv2.resize(
            sample["image"],
            (width, height),
            interpolation=self.__image_interpolation_method,
        )

        if self.__resize_target:
            if "disparity" in sample:
                sample["disparity"] = cv2.resize(
                    sample["disparity"],
                    (width, height),
                    interpolation=cv2.INTER_NEAREST,
                )

            if "depth" in sample:
                sample["depth"] = cv2.resize(
                    sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
                )

            sample["mask"] = cv2.resize(
                sample["mask"].astype(np.float32),
                (width, height),
                interpolation=cv2.INTER_NEAREST,
            )
            sample["mask"] = sample["mask"].astype(bool)

        return sample


class NormalizeImage(object):
    """Normalize image by given mean and std.
    """

    def __init__(self, mean, std):
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        sample["image"] = (sample["image"] - self.__mean) / self.__std

        return sample


class PrepareForNet(object):
    """Prepare sample for usage as network input.
    """

    def __init__(self):
        pass

    def __call__(self, sample):
        image = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(image).astype(np.float32)

        if "mask" in sample:
            sample["mask"] = sample["mask"].astype(np.float32)
            sample["mask"] = np.ascontiguousarray(sample["mask"])

        if "disparity" in sample:
            disparity = sample["disparity"].astype(np.float32)
            sample["disparity"] = np.ascontiguousarray(disparity)

        if "depth" in sample:
            depth = sample["depth"].astype(np.float32)
            sample["depth"] = np.ascontiguousarray(depth)

        return sample
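A small sketch of the transform pipeline above applied to a dummy sample (not part of the commit; the image and sizes are illustrative assumptions):

```python
# Hypothetical sketch of the preprocessing pipeline, not part of the commit.
import numpy as np
from torchvision.transforms import Compose
from midas.transforms import Resize, NormalizeImage, PrepareForNet

transform = Compose([
    Resize(
        256, 256,
        resize_target=None,          # no mask/disparity/depth in this sample
        keep_aspect_ratio=True,
        ensure_multiple_of=32,
        resize_method="upper_bound",
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

# Dummy 480x640 RGB image with values in [0, 1].
sample = {"image": np.random.rand(480, 640, 3).astype(np.float32)}
out = transform(sample)["image"]
# "upper_bound" keeps the aspect ratio, caps both sides at 256, and rounds
# to multiples of 32, so the result is CHW float32, here (3, 192, 256).
print(out.shape, out.dtype)
```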
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/output/.placeholder
ADDED
File without changes
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Alexey

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/README.md
ADDED
@@ -0,0 +1,131 @@
# MiDaS for ROS1 by using LibTorch in C++

### Requirements

- Ubuntu 17.10 / 18.04 / 20.04, Debian Stretch
- ROS Melodic for Ubuntu (17.10 / 18.04) / Debian Stretch, ROS Noetic for Ubuntu 20.04
- C++11
- LibTorch >= 1.6

## Quick Start with a MiDaS Example

MiDaS is a neural network to compute depth from a single image.

* input from `image_topic`: `sensor_msgs/Image` - `RGB8` image with any shape
* output to `midas_topic`: `sensor_msgs/Image` - `TYPE_32FC1` inverse relative depth maps in range [0 - 255] with original size and channels=1

### Install Dependencies

* install ROS Melodic for Ubuntu 17.10 / 18.04:
```bash
wget https://raw.githubusercontent.com/isl-org/MiDaS/master/ros/additions/install_ros_melodic_ubuntu_17_18.sh
./install_ros_melodic_ubuntu_17_18.sh
```

or Noetic for Ubuntu 20.04:

```bash
wget https://raw.githubusercontent.com/isl-org/MiDaS/master/ros/additions/install_ros_noetic_ubuntu_20.sh
./install_ros_noetic_ubuntu_20.sh
```


* install LibTorch 1.7 with CUDA 11.0:

On **Jetson (ARM)**:
```bash
wget https://nvidia.box.com/shared/static/wa34qwrwtk9njtyarwt5nvo6imenfy26.whl -O torch-1.7.0-cp36-cp36m-linux_aarch64.whl
sudo apt-get install python3-pip libopenblas-base libopenmpi-dev
pip3 install Cython
pip3 install numpy torch-1.7.0-cp36-cp36m-linux_aarch64.whl
```
Or compile LibTorch from source: https://github.com/pytorch/pytorch#from-source

On **Linux (x86_64)**:
```bash
cd ~/
wget https://download.pytorch.org/libtorch/cu110/libtorch-cxx11-abi-shared-with-deps-1.7.0%2Bcu110.zip
unzip libtorch-cxx11-abi-shared-with-deps-1.7.0+cu110.zip
```

* create symlink for OpenCV:

```bash
sudo ln -s /usr/include/opencv4 /usr/include/opencv
```

* download and install MiDaS:

```bash
source ~/.bashrc
cd ~/
mkdir catkin_ws
cd catkin_ws
git clone https://github.com/isl-org/MiDaS
mkdir src
cp -r MiDaS/ros/* src

chmod +x src/additions/*.sh
chmod +x src/*.sh
chmod +x src/midas_cpp/scripts/*.py
cp src/additions/do_catkin_make.sh ./do_catkin_make.sh
./do_catkin_make.sh
./src/additions/downloads.sh
```

### Usage

* run only `midas` node: `~/catkin_ws/src/launch_midas_cpp.sh`

#### Test

* Test - capture video and show result in the window:
    * place any `test.mp4` video file to the directory `~/catkin_ws/src/`
    * run `midas` node: `~/catkin_ws/src/launch_midas_cpp.sh`
    * run test nodes in another terminal: `cd ~/catkin_ws/src && ./run_talker_listener_test.sh` and wait 30 seconds

(to use Python 2, run command `sed -i 's/python3/python2/' ~/catkin_ws/src/midas_cpp/scripts/*.py` )

## Mobile version of MiDaS - Monocular Depth Estimation

### Accuracy

* MiDaS v2 small - ResNet50 default-decoder 384x384
* MiDaS v2.1 small - EfficientNet-Lite3 small-decoder 256x256

**Zero-shot error** (the lower - the better):

| Model | DIW WHDR | Eth3d AbsRel | Sintel AbsRel | Kitti δ>1.25 | NyuDepthV2 δ>1.25 | TUM δ>1.25 |
|---|---|---|---|---|---|---|
| MiDaS v2 small 384x384 | **0.1248** | 0.1550 | **0.3300** | **21.81** | 15.73 | 17.00 |
| MiDaS v2.1 small 256x256 | 0.1344 | **0.1344** | 0.3370 | 29.27 | **13.43** | **14.53** |
| Relative improvement, % | -8 % | **+13 %** | -2 % | -34 % | **+15 %** | **+15 %** |

None of the Train/Valid/Test subsets of these datasets (DIW, Eth3d, Sintel, Kitti, NyuDepthV2, TUM) were involved in training or fine-tuning.

### Inference speed (FPS) on nVidia GPU

Inference speed excluding pre- and post-processing, batch=1, **Frames Per Second** (the higher - the better):

| Model | Jetson Nano, FPS | RTX 2080Ti, FPS |
|---|---|---|
| MiDaS v2 small 384x384 | 1.6 | 117 |
| MiDaS v2.1 small 256x256 | 8.1 | 232 |
| SpeedUp, X times | **5x** | **2x** |

### Citation

This repository contains code to compute depth from a single image. It accompanies our [paper](https://arxiv.org/abs/1907.01341v3):

>Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer
René Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, Vladlen Koltun

Please cite our paper if you use this code or any of the models:
```
@article{Ranftl2020,
    author  = {Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun},
    title   = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer},
    journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
    year    = {2020},
}
```
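As a supplement to the topic description in the README above (not part of the commit), a minimal rospy subscriber that converts the `TYPE_32FC1` inverse relative depth on `midas_topic` to an 8-bit image for display could look like the sketch below; the packaged `listener.py` further down does the same with more options.

```python
#!/usr/bin/env python3
# Hypothetical minimal viewer for midas_topic, not part of the commit.
import rospy
import cv2
import numpy as np
from sensor_msgs.msg import Image
from cv_bridge import CvBridge

bridge = CvBridge()

def on_depth(msg):
    depth = bridge.imgmsg_to_cv2(msg)              # float32, roughly in [0, 255]
    vis = np.clip(depth, 0, 255).astype(np.uint8)  # clip and cast for display
    cv2.imshow("midas_depth", vis)
    cv2.waitKey(1)

if __name__ == "__main__":
    rospy.init_node("midas_depth_viewer", anonymous=True)
    rospy.Subscriber("midas_topic", Image, on_depth)
    rospy.spin()
```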
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/do_catkin_make.sh
ADDED
@@ -0,0 +1,5 @@
mkdir src
catkin_make
source devel/setup.bash
echo $ROS_PACKAGE_PATH
chmod +x ./devel/setup.bash
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/downloads.sh
ADDED
@@ -0,0 +1,5 @@
mkdir ~/.ros
wget https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small-traced.pt
cp ./model-small-traced.pt ~/.ros/model-small-traced.pt
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_melodic_ubuntu_17_18.sh
ADDED
@@ -0,0 +1,34 @@
#@title { display-mode: "code" }

#from http://wiki.ros.org/indigo/Installation/Ubuntu

#1.2 Setup sources.list
sudo sh -c 'echo "deb http://packages.ros.org/ros/ubuntu $(lsb_release -sc) main" > /etc/apt/sources.list.d/ros-latest.list'

# 1.3 Setup keys
sudo apt-key adv --keyserver 'hkp://keyserver.ubuntu.com:80' --recv-key C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654
sudo apt-key adv --keyserver 'hkp://ha.pool.sks-keyservers.net:80' --recv-key 421C365BD9FF1F717815A3895523BAEEB01FA116

curl -sSL 'http://keyserver.ubuntu.com/pks/lookup?op=get&search=0xC1CF6E31E6BADE8868B172B4F42ED6FBAB17C654' | sudo apt-key add -

# 1.4 Installation
sudo apt-get update
sudo apt-get upgrade

# Desktop-Full Install:
sudo apt-get install ros-melodic-desktop-full

printf "\nsource /opt/ros/melodic/setup.bash\n" >> ~/.bashrc

# 1.5 Initialize rosdep
sudo rosdep init
rosdep update


# 1.7 Getting rosinstall (python)
sudo apt-get install python-rosinstall
sudo apt-get install python-catkin-tools
sudo apt-get install python-rospy
sudo apt-get install python-rosdep
sudo apt-get install python-roscd
sudo apt-get install python-pip
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_noetic_ubuntu_20.sh
ADDED
@@ -0,0 +1,33 @@
#@title { display-mode: "code" }

#from http://wiki.ros.org/indigo/Installation/Ubuntu

#1.2 Setup sources.list
sudo sh -c 'echo "deb http://packages.ros.org/ros/ubuntu $(lsb_release -sc) main" > /etc/apt/sources.list.d/ros-latest.list'

# 1.3 Setup keys
sudo apt-key adv --keyserver 'hkp://keyserver.ubuntu.com:80' --recv-key C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654

curl -sSL 'http://keyserver.ubuntu.com/pks/lookup?op=get&search=0xC1CF6E31E6BADE8868B172B4F42ED6FBAB17C654' | sudo apt-key add -

# 1.4 Installation
sudo apt-get update
sudo apt-get upgrade

# Desktop-Full Install:
sudo apt-get install ros-noetic-desktop-full

printf "\nsource /opt/ros/noetic/setup.bash\n" >> ~/.bashrc

# 1.5 Initialize rosdep
sudo rosdep init
rosdep update


# 1.7 Getting rosinstall (python)
sudo apt-get install python3-rosinstall
sudo apt-get install python3-catkin-tools
sudo apt-get install python3-rospy
sudo apt-get install python3-rosdep
sudo apt-get install python3-roscd
sudo apt-get install python3-pip
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/make_package_cpp.sh
ADDED
@@ -0,0 +1,16 @@
cd ~/catkin_ws/src
catkin_create_pkg midas_cpp std_msgs roscpp cv_bridge sensor_msgs image_transport
cd ~/catkin_ws
catkin_make

chmod +x ~/catkin_ws/devel/setup.bash
printf "\nsource ~/catkin_ws/devel/setup.bash" >> ~/.bashrc
source ~/catkin_ws/devel/setup.bash


sudo rosdep init
rosdep update
#rospack depends1 midas_cpp
roscd midas_cpp
#cat package.xml
#rospack depends midas_cpp
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/launch_midas_cpp.sh
ADDED
@@ -0,0 +1,2 @@
source ~/catkin_ws/devel/setup.bash
roslaunch midas_cpp midas_cpp.launch model_name:="model-small-traced.pt" input_topic:="image_topic" output_topic:="midas_topic" out_orig_size:="true"
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/CMakeLists.txt
ADDED
@@ -0,0 +1,189 @@
cmake_minimum_required(VERSION 3.0.2)
project(midas_cpp)

## Compile as C++11, supported in ROS Kinetic and newer
# add_compile_options(-std=c++11)

## Find catkin macros and libraries
## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
## is used, also find other catkin packages
find_package(catkin REQUIRED COMPONENTS
  cv_bridge
  image_transport
  roscpp
  rospy
  sensor_msgs
  std_msgs
)

## System dependencies are found with CMake's conventions
# find_package(Boost REQUIRED COMPONENTS system)

list(APPEND CMAKE_PREFIX_PATH "~/libtorch")
list(APPEND CMAKE_PREFIX_PATH "/usr/local/lib/python3.6/dist-packages/torch/lib")
list(APPEND CMAKE_PREFIX_PATH "/usr/local/lib/python2.7/dist-packages/torch/lib")

if(NOT EXISTS "~/libtorch")
  if (EXISTS "/usr/local/lib/python3.6/dist-packages/torch")
    include_directories(/usr/local/include)
    include_directories(/usr/local/lib/python3.6/dist-packages/torch/include/torch/csrc/api/include)
    include_directories(/usr/local/lib/python3.6/dist-packages/torch/include)

    link_directories(/usr/local/lib)
    link_directories(/usr/local/lib/python3.6/dist-packages/torch/lib)

    set(CMAKE_PREFIX_PATH /usr/local/lib/python3.6/dist-packages/torch)
    set(Boost_USE_MULTITHREADED ON)
    set(Torch_DIR /usr/local/lib/python3.6/dist-packages/torch)

  elseif (EXISTS "/usr/local/lib/python2.7/dist-packages/torch")

    include_directories(/usr/local/include)
    include_directories(/usr/local/lib/python2.7/dist-packages/torch/include/torch/csrc/api/include)
    include_directories(/usr/local/lib/python2.7/dist-packages/torch/include)

    link_directories(/usr/local/lib)
    link_directories(/usr/local/lib/python2.7/dist-packages/torch/lib)

    set(CMAKE_PREFIX_PATH /usr/local/lib/python2.7/dist-packages/torch)
    set(Boost_USE_MULTITHREADED ON)
    set(Torch_DIR /usr/local/lib/python2.7/dist-packages/torch)
  endif()
endif()



find_package(Torch REQUIRED)
find_package(OpenCV REQUIRED)
include_directories( ${OpenCV_INCLUDE_DIRS} )

add_executable(midas_cpp src/main.cpp)
target_link_libraries(midas_cpp "${TORCH_LIBRARIES}" "${OpenCV_LIBS} ${catkin_LIBRARIES}")
set_property(TARGET midas_cpp PROPERTY CXX_STANDARD 14)



###################################
## catkin specific configuration ##
###################################
## The catkin_package macro generates cmake config files for your package
## Declare things to be passed to dependent projects
## INCLUDE_DIRS: uncomment this if your package contains header files
## LIBRARIES: libraries you create in this project that dependent projects also need
## CATKIN_DEPENDS: catkin_packages dependent projects also need
## DEPENDS: system dependencies of this project that dependent projects also need
catkin_package(
#  INCLUDE_DIRS include
#  LIBRARIES midas_cpp
#  CATKIN_DEPENDS cv_bridge image_transport roscpp sensor_msgs std_msgs
#  DEPENDS system_lib
)

###########
## Build ##
###########

## Specify additional locations of header files
## Your package locations should be listed before other locations
include_directories(
# include
  ${catkin_INCLUDE_DIRS}
)

## Declare a C++ library
# add_library(${PROJECT_NAME}
#   src/${PROJECT_NAME}/midas_cpp.cpp
# )

## Add cmake target dependencies of the library
## as an example, code may need to be generated before libraries
## either from message generation or dynamic reconfigure
# add_dependencies(${PROJECT_NAME} ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS})

## Declare a C++ executable
## With catkin_make all packages are built within a single CMake context
## The recommended prefix ensures that target names across packages don't collide
# add_executable(${PROJECT_NAME}_node src/midas_cpp_node.cpp)

## Rename C++ executable without prefix
## The above recommended prefix causes long target names, the following renames the
## target back to the shorter version for ease of user use
## e.g. "rosrun someones_pkg node" instead of "rosrun someones_pkg someones_pkg_node"
# set_target_properties(${PROJECT_NAME}_node PROPERTIES OUTPUT_NAME node PREFIX "")

## Add cmake target dependencies of the executable
## same as for the library above
# add_dependencies(${PROJECT_NAME}_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS})

## Specify libraries to link a library or executable target against
# target_link_libraries(${PROJECT_NAME}_node
#   ${catkin_LIBRARIES}
# )

#############
## Install ##
#############

# all install targets should use catkin DESTINATION variables
# See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html

## Mark executable scripts (Python etc.) for installation
## in contrast to setup.py, you can choose the destination
# catkin_install_python(PROGRAMS
#   scripts/my_python_script
#   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
# )

## Mark executables for installation
## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html
# install(TARGETS ${PROJECT_NAME}_node
#   RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
# )

## Mark libraries for installation
## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_libraries.html
# install(TARGETS ${PROJECT_NAME}
#   ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
#   LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
#   RUNTIME DESTINATION ${CATKIN_GLOBAL_BIN_DESTINATION}
# )

## Mark cpp header files for installation
# install(DIRECTORY include/${PROJECT_NAME}/
#   DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION}
#   FILES_MATCHING PATTERN "*.h"
#   PATTERN ".svn" EXCLUDE
# )

## Mark other files for installation (e.g. launch and bag files, etc.)
# install(FILES
#   # myfile1
#   # myfile2
#   DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
# )

#############
## Testing ##
#############

## Add gtest based cpp test target and link libraries
# catkin_add_gtest(${PROJECT_NAME}-test test/test_midas_cpp.cpp)
# if(TARGET ${PROJECT_NAME}-test)
#   target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME})
# endif()

## Add folders to be run by python nosetests
# catkin_add_nosetests(test)

install(TARGETS ${PROJECT_NAME}
        ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
        LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
        RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)

add_custom_command(
    TARGET midas_cpp POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_CURRENT_BINARY_DIR}/midas_cpp
        ${CMAKE_SOURCE_DIR}/midas_cpp
)
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_cpp.launch
ADDED
@@ -0,0 +1,19 @@
<launch>
    <arg name="input_topic" default="image_topic"/>
    <arg name="output_topic" default="midas_topic"/>
    <arg name="model_name" default="model-small-traced.pt"/>
    <arg name="out_orig_size" default="true"/>
    <arg name="net_width" default="256"/>
    <arg name="net_height" default="256"/>
    <arg name="logging" default="false"/>

    <node pkg="midas_cpp" type="midas_cpp" name="midas_cpp" output="log" respawn="true">
        <param name="input_topic" value="$(arg input_topic)"/>
        <param name="output_topic" value="$(arg output_topic)"/>
        <param name="model_name" value="$(arg model_name)"/>
        <param name="out_orig_size" value="$(arg out_orig_size)"/>
        <param name="net_width" value="$(arg net_width)"/>
        <param name="net_height" value="$(arg net_height)"/>
        <param name="logging" value="$(arg logging)"/>
    </node>
</launch>
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_talker_listener.launch
ADDED
@@ -0,0 +1,23 @@
<launch>
    <arg name="use_camera" default="false"/>
    <arg name="input_video_file" default="test.mp4"/>

    <arg name="show_output" default="true"/>
    <arg name="save_output" default="false"/>
    <arg name="output_video_file" default="result.mp4"/>

    <node pkg="midas_cpp" type="talker.py" name="talker" output="log" respawn="true">
        <param name="use_camera" value="$(arg use_camera)"/>
        <param name="input_video_file" value="$(arg input_video_file)"/>
    </node>

    <node pkg="midas_cpp" type="listener.py" name="listener" output="log" respawn="true">
        <param name="show_output" value="$(arg show_output)"/>
        <param name="save_output" value="$(arg save_output)"/>
        <param name="output_video_file" value="$(arg output_video_file)"/>
    </node>

    <node pkg="midas_cpp" type="listener_original.py" name="listener_original" output="log" respawn="true">
        <param name="show_output" value="$(arg show_output)"/>
    </node>
</launch>
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/package.xml
ADDED
@@ -0,0 +1,77 @@
<?xml version="1.0"?>
<package format="2">
  <name>midas_cpp</name>
  <version>0.1.0</version>
  <description>The midas_cpp package</description>

  <maintainer email="alexeyab84@gmail.com">Alexey Bochkovskiy</maintainer>
  <license>MIT</license>
  <url type="website">https://github.com/isl-org/MiDaS/tree/master/ros</url>
  <!-- <author email="alexeyab84@gmail.com">Alexey Bochkovskiy</author> -->


  <!-- One license tag required, multiple allowed, one license per tag -->
  <!-- Commonly used license strings: -->
  <!--   BSD, MIT, Boost Software License, GPLv2, GPLv3, LGPLv2.1, LGPLv3 -->
  <license>TODO</license>


  <!-- Url tags are optional, but multiple are allowed, one per tag -->
  <!-- Optional attribute type can be: website, bugtracker, or repository -->
  <!-- Example: -->
  <!-- <url type="website">http://wiki.ros.org/midas_cpp</url> -->


  <!-- Author tags are optional, multiple are allowed, one per tag -->
  <!-- Authors do not have to be maintainers, but could be -->
  <!-- Example: -->
  <!-- <author email="jane.doe@example.com">Jane Doe</author> -->


  <!-- The *depend tags are used to specify dependencies -->
  <!-- Dependencies can be catkin packages or system dependencies -->
  <!-- Examples: -->
  <!-- Use depend as a shortcut for packages that are both build and exec dependencies -->
  <!--   <depend>roscpp</depend> -->
  <!--   Note that this is equivalent to the following: -->
  <!--   <build_depend>roscpp</build_depend> -->
  <!--   <exec_depend>roscpp</exec_depend> -->
  <!-- Use build_depend for packages you need at compile time: -->
  <!--   <build_depend>message_generation</build_depend> -->
  <!-- Use build_export_depend for packages you need in order to build against this package: -->
  <!--   <build_export_depend>message_generation</build_export_depend> -->
  <!-- Use buildtool_depend for build tool packages: -->
  <!--   <buildtool_depend>catkin</buildtool_depend> -->
  <!-- Use exec_depend for packages you need at runtime: -->
  <!--   <exec_depend>message_runtime</exec_depend> -->
  <!-- Use test_depend for packages you need only for testing: -->
  <!--   <test_depend>gtest</test_depend> -->
  <!-- Use doc_depend for packages you need only for building documentation: -->
  <!--   <doc_depend>doxygen</doc_depend> -->
  <buildtool_depend>catkin</buildtool_depend>
  <build_depend>cv_bridge</build_depend>
  <build_depend>image_transport</build_depend>
  <build_depend>roscpp</build_depend>
  <build_depend>rospy</build_depend>
  <build_depend>sensor_msgs</build_depend>
  <build_depend>std_msgs</build_depend>
  <build_export_depend>cv_bridge</build_export_depend>
  <build_export_depend>image_transport</build_export_depend>
  <build_export_depend>roscpp</build_export_depend>
  <build_export_depend>rospy</build_export_depend>
  <build_export_depend>sensor_msgs</build_export_depend>
  <build_export_depend>std_msgs</build_export_depend>
  <exec_depend>cv_bridge</exec_depend>
  <exec_depend>image_transport</exec_depend>
  <exec_depend>roscpp</exec_depend>
  <exec_depend>rospy</exec_depend>
  <exec_depend>sensor_msgs</exec_depend>
  <exec_depend>std_msgs</exec_depend>


  <!-- The export tag contains other, unspecified, tags -->
  <export>
    <!-- Other tools can request additional information be placed here -->

  </export>
</package>
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener.py
ADDED
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
from __future__ import print_function

import roslib
#roslib.load_manifest('my_package')
import sys
import rospy
import cv2
import numpy as np
from std_msgs.msg import String
from sensor_msgs.msg import Image
from cv_bridge import CvBridge, CvBridgeError

class video_show:

    def __init__(self):
        self.show_output = rospy.get_param('~show_output', True)
        self.save_output = rospy.get_param('~save_output', False)
        self.output_video_file = rospy.get_param('~output_video_file', 'result.mp4')
        # rospy.loginfo(f"Listener - params: show_output={self.show_output}, save_output={self.save_output}, output_video_file={self.output_video_file}")

        # the video writer is created lazily on the first saved frame
        self.video_writer_init = False
        self.out = None

        self.bridge = CvBridge()
        self.image_sub = rospy.Subscriber("midas_topic", Image, self.callback)

    def callback(self, data):
        try:
            cv_image = self.bridge.imgmsg_to_cv2(data)
        except CvBridgeError as e:
            print(e)
            return

        if cv_image.size == 0:
            return

        rospy.loginfo("Listener: Received new frame")
        cv_image = cv_image.astype("uint8")

        if self.show_output == True:
            cv2.imshow("video_show", cv_image)
            cv2.waitKey(10)

        if self.save_output == True:
            if self.video_writer_init == False:
                fourcc = cv2.VideoWriter_fourcc(*'XVID')
                self.out = cv2.VideoWriter(self.output_video_file, fourcc, 25, (cv_image.shape[1], cv_image.shape[0]))
                self.video_writer_init = True

            self.out.write(cv_image)


def main(args):
    rospy.init_node('listener', anonymous=True)
    ic = video_show()
    try:
        rospy.spin()
    except KeyboardInterrupt:
        print("Shutting down")
    cv2.destroyAllWindows()

if __name__ == '__main__':
    main(sys.argv)
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener_original.py
ADDED
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
from __future__ import print_function

import roslib
#roslib.load_manifest('my_package')
import sys
import rospy
import cv2
import numpy as np
from std_msgs.msg import String
from sensor_msgs.msg import Image
from cv_bridge import CvBridge, CvBridgeError

class video_show:

    def __init__(self):
        self.show_output = rospy.get_param('~show_output', True)
        self.save_output = rospy.get_param('~save_output', False)
        self.output_video_file = rospy.get_param('~output_video_file', 'result.mp4')
        # rospy.loginfo(f"Listener original - params: show_output={self.show_output}, save_output={self.save_output}, output_video_file={self.output_video_file}")

        # the video writer is created lazily on the first saved frame
        self.video_writer_init = False
        self.out = None

        self.bridge = CvBridge()
        self.image_sub = rospy.Subscriber("image_topic", Image, self.callback)

    def callback(self, data):
        try:
            cv_image = self.bridge.imgmsg_to_cv2(data)
        except CvBridgeError as e:
            print(e)
            return

        if cv_image.size == 0:
            return

        rospy.loginfo("Listener_original: Received new frame")
        cv_image = cv_image.astype("uint8")

        if self.show_output == True:
            cv2.imshow("video_show_orig", cv_image)
            cv2.waitKey(10)

        if self.save_output == True:
            if self.video_writer_init == False:
                fourcc = cv2.VideoWriter_fourcc(*'XVID')
                self.out = cv2.VideoWriter(self.output_video_file, fourcc, 25, (cv_image.shape[1], cv_image.shape[0]))
                self.video_writer_init = True

            self.out.write(cv_image)


def main(args):
    rospy.init_node('listener_original', anonymous=True)
    ic = video_show()
    try:
        rospy.spin()
    except KeyboardInterrupt:
        print("Shutting down")
    cv2.destroyAllWindows()

if __name__ == '__main__':
    main(sys.argv)
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/talker.py
ADDED
@@ -0,0 +1,53 @@
#!/usr/bin/env python3


import roslib
#roslib.load_manifest('my_package')
import sys
import rospy
import cv2
from std_msgs.msg import String
from sensor_msgs.msg import Image
from cv_bridge import CvBridge, CvBridgeError


def talker():
    rospy.init_node('talker', anonymous=True)

    use_camera = rospy.get_param('~use_camera', False)
    input_video_file = rospy.get_param('~input_video_file', 'test.mp4')
    # rospy.loginfo(f"Talker - params: use_camera={use_camera}, input_video_file={input_video_file}")

    # rospy.loginfo("Talker: Trying to open a video stream")
    if use_camera == True:
        cap = cv2.VideoCapture(0)
    else:
        cap = cv2.VideoCapture(input_video_file)

    pub = rospy.Publisher('image_topic', Image, queue_size=1)
    rate = rospy.Rate(30)  # 30hz
    bridge = CvBridge()

    while not rospy.is_shutdown():
        ret, cv_image = cap.read()
        if ret == False:
            print("Talker: Video is over")
            rospy.loginfo("Video is over")
            return

        try:
            image = bridge.cv2_to_imgmsg(cv_image, "bgr8")
        except CvBridgeError as e:
            rospy.logerr("Talker: cv2image conversion failed: %s", e)
            print(e)
            continue

        rospy.loginfo("Talker: Publishing frame")
        pub.publish(image)
        rate.sleep()

if __name__ == '__main__':
    try:
        talker()
    except rospy.ROSInterruptException:
        pass
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/src/main.cpp
ADDED
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <ros/ros.h>
|
2 |
+
#include <image_transport/image_transport.h>
|
3 |
+
#include <cv_bridge/cv_bridge.h>
|
4 |
+
#include <sensor_msgs/image_encodings.h>
|
5 |
+
|
6 |
+
#include <initializer_list>
|
7 |
+
|
8 |
+
#include <torch/script.h> // One-stop header.
|
9 |
+
|
10 |
+
#include <opencv2/core/version.hpp>
|
11 |
+
#include <opencv2/imgproc/imgproc.hpp>
|
12 |
+
#include <opencv2/opencv.hpp>
|
13 |
+
#include <opencv2/opencv_modules.hpp>
|
14 |
+
|
15 |
+
#include <opencv2/highgui/highgui.hpp>
|
16 |
+
#include <opencv2/video/video.hpp>
|
17 |
+
|
18 |
+
// includes for OpenCV >= 3.x
|
19 |
+
#ifndef CV_VERSION_EPOCH
|
20 |
+
#include <opencv2/core/types.hpp>
|
21 |
+
#include <opencv2/videoio/videoio.hpp>
|
22 |
+
#include <opencv2/imgcodecs/imgcodecs.hpp>
|
23 |
+
#endif
|
24 |
+
|
25 |
+
// OpenCV includes for OpenCV 2.x
|
26 |
+
#ifdef CV_VERSION_EPOCH
|
27 |
+
#include <opencv2/highgui/highgui_c.h>
|
28 |
+
#include <opencv2/imgproc/imgproc_c.h>
|
29 |
+
#include <opencv2/core/types_c.h>
|
30 |
+
#include <opencv2/core/version.hpp>
|
31 |
+
#endif
|
32 |
+
|
33 |
+
static const std::string OPENCV_WINDOW = "Image window";
|
34 |
+
|
35 |
+
class Midas
|
36 |
+
{
|
37 |
+
ros::NodeHandle nh_;
|
38 |
+
image_transport::ImageTransport it_;
|
39 |
+
image_transport::Subscriber image_sub_;
|
40 |
+
image_transport::Publisher image_pub_;
|
41 |
+
|
42 |
+
torch::jit::script::Module module;
|
43 |
+
torch::Device device;
|
44 |
+
|
45 |
+
auto ToTensor(cv::Mat img, bool show_output = false, bool unsqueeze = false, int unsqueeze_dim = 0)
|
46 |
+
{
|
47 |
+
//std::cout << "image shape: " << img.size() << std::endl;
|
48 |
+
at::Tensor tensor_image = torch::from_blob(img.data, { img.rows, img.cols, 3 }, at::kByte);
|
49 |
+
|
50 |
+
if (unsqueeze)
|
51 |
+
{
|
52 |
+
tensor_image.unsqueeze_(unsqueeze_dim);
|
53 |
+
//std::cout << "tensors new shape: " << tensor_image.sizes() << std::endl;
|
54 |
+
}
|
55 |
+
|
56 |
+
if (show_output)
|
57 |
+
{
|
58 |
+
std::cout << tensor_image.slice(2, 0, 1) << std::endl;
|
59 |
+
}
|
60 |
+
//std::cout << "tenor shape: " << tensor_image.sizes() << std::endl;
|
61 |
+
return tensor_image;
|
62 |
+
}
|
63 |
+
|
64 |
+
auto ToInput(at::Tensor tensor_image)
|
65 |
+
{
|
66 |
+
// Create a vector of inputs.
|
67 |
+
return std::vector<torch::jit::IValue>{tensor_image};
|
68 |
+
}
|
69 |
+
|
70 |
+
auto ToCvImage(at::Tensor tensor, int cv_type = CV_8UC3)
|
71 |
+
{
|
72 |
+
int width = tensor.sizes()[0];
|
73 |
+
int height = tensor.sizes()[1];
|
74 |
+
try
|
75 |
+
{
|
76 |
+
cv::Mat output_mat;
|
77 |
+
if (cv_type == CV_8UC4 || cv_type == CV_8UC3 || cv_type == CV_8UC2 || cv_type == CV_8UC1) {
|
78 |
+
cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr<uchar>());
|
79 |
+
output_mat = cv_image;
|
80 |
+
}
|
81 |
+
else if (cv_type == CV_32FC4 || cv_type == CV_32FC3 || cv_type == CV_32FC2 || cv_type == CV_32FC1) {
|
82 |
+
cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr<float>());
|
83 |
+
output_mat = cv_image;
|
84 |
+
}
|
85 |
+
else if (cv_type == CV_64FC4 || cv_type == CV_64FC3 || cv_type == CV_64FC2 || cv_type == CV_64FC1) {
|
86 |
+
cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr<double>());
|
87 |
+
output_mat = cv_image;
|
88 |
+
}
|
89 |
+
|
90 |
+
//show_image(output_mat, "converted image from tensor");
|
91 |
+
return output_mat.clone();
|
92 |
+
}
|
93 |
+
catch (const c10::Error& e)
|
94 |
+
{
|
95 |
+
std::cout << "an error has occured : " << e.msg() << std::endl;
|
96 |
+
}
|
97 |
+
return cv::Mat(height, width, CV_8UC3);
|
98 |
+
}
|
99 |
+
|
100 |
+
std::string input_topic, output_topic, model_name;
|
101 |
+
bool out_orig_size;
|
102 |
+
int net_width, net_height;
|
103 |
+
torch::NoGradGuard guard;
|
104 |
+
at::Tensor mean, std;
|
105 |
+
at::Tensor output, tensor;
|
106 |
+
|
107 |
+
public:
|
108 |
+
Midas()
|
109 |
+
: nh_(), it_(nh_), device(torch::Device(torch::kCPU))
|
110 |
+
{
|
111 |
+
ros::param::param<std::string>("~input_topic", input_topic, "image_topic");
|
112 |
+
ros::param::param<std::string>("~output_topic", output_topic, "midas_topic");
|
113 |
+
ros::param::param<std::string>("~model_name", model_name, "model-small-traced.pt");
|
114 |
+
ros::param::param<bool>("~out_orig_size", out_orig_size, true);
|
115 |
+
ros::param::param<int>("~net_width", net_width, 256);
|
116 |
+
ros::param::param<int>("~net_height", net_height, 256);
|
117 |
+
|
118 |
+
std::cout << ", input_topic = " << input_topic <<
|
119 |
+
", output_topic = " << output_topic <<
|
120 |
+
", model_name = " << model_name <<
|
121 |
+
", out_orig_size = " << out_orig_size <<
|
122 |
+
", net_width = " << net_width <<
|
123 |
+
", net_height = " << net_height <<
|
124 |
+
std::endl;
|
125 |
+
|
126 |
+
// Subscrive to input video feed and publish output video feed
|
127 |
+
image_sub_ = it_.subscribe(input_topic, 1, &Midas::imageCb, this);
|
128 |
+
image_pub_ = it_.advertise(output_topic, 1);
|
129 |
+
|
130 |
+
std::cout << "Try to load torchscript model \n";
|
131 |
+
|
132 |
+
try {
|
133 |
+
// Deserialize the ScriptModule from a file using torch::jit::load().
|
134 |
+
module = torch::jit::load(model_name);
|
135 |
+
}
|
136 |
+
catch (const c10::Error& e) {
|
137 |
+
std::cerr << "error loading the model\n";
|
138 |
+
exit(0);
|
139 |
+
}
|
140 |
+
|
141 |
+
std::cout << "ok\n";
|
142 |
+
|
143 |
+
try {
|
144 |
+
module.eval();
|
145 |
+
torch::jit::getProfilingMode() = false;
|
146 |
+
torch::jit::setGraphExecutorOptimize(true);
|
147 |
+
|
148 |
+
mean = torch::tensor({ 0.485, 0.456, 0.406 });
|
149 |
+
std = torch::tensor({ 0.229, 0.224, 0.225 });
|
150 |
+
|
151 |
+
if (torch::hasCUDA()) {
|
152 |
+
std::cout << "cuda is available" << std::endl;
|
153 |
+
at::globalContext().setBenchmarkCuDNN(true);
|
154 |
+
device = torch::Device(torch::kCUDA);
|
155 |
+
module.to(device);
|
156 |
+
mean = mean.to(device);
|
157 |
+
std = std.to(device);
|
158 |
+
}
|
159 |
+
}
|
160 |
+
catch (const c10::Error& e)
|
161 |
+
{
|
162 |
+
std::cerr << " module initialization: " << e.msg() << std::endl;
|
163 |
+
}
|
164 |
+
}
|
165 |
+
|
166 |
+
~Midas()
|
167 |
+
{
|
168 |
+
}
|
169 |
+
|
170 |
+
void imageCb(const sensor_msgs::ImageConstPtr& msg)
|
171 |
+
{
|
172 |
+
cv_bridge::CvImagePtr cv_ptr;
|
173 |
+
try
|
174 |
+
{
|
175 |
+
// sensor_msgs::Image to cv::Mat
|
176 |
+
cv_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::RGB8);
|
177 |
+
}
|
178 |
+
catch (cv_bridge::Exception& e)
|
179 |
+
{
|
180 |
+
ROS_ERROR("cv_bridge exception: %s", e.what());
|
181 |
+
return;
|
182 |
+
}
|
183 |
+
|
184 |
+
// pre-processing
|
185 |
+
auto tensor_cpu = ToTensor(cv_ptr->image); // OpenCV-image -> Libtorch-tensor
|
186 |
+
|
187 |
+
try {
|
188 |
+
tensor = tensor_cpu.to(device); // move to device (CPU or GPU)
|
189 |
+
|
190 |
+
tensor = tensor.toType(c10::kFloat);
|
191 |
+
tensor = tensor.permute({ 2, 0, 1 }); // HWC -> CHW
|
192 |
+
tensor = tensor.unsqueeze(0);
|
193 |
+
tensor = at::upsample_bilinear2d(tensor, { net_height, net_width }, true); // resize
|
194 |
+
tensor = tensor.squeeze(0);
|
195 |
+
tensor = tensor.permute({ 1, 2, 0 }); // CHW -> HWC
|
196 |
+
|
197 |
+
tensor = tensor.div(255).sub(mean).div(std); // normalization
|
198 |
+
tensor = tensor.permute({ 2, 0, 1 }); // HWC -> CHW
|
199 |
+
tensor.unsqueeze_(0); // CHW -> NCHW
|
200 |
+
}
|
201 |
+
catch (const c10::Error& e)
|
202 |
+
{
|
203 |
+
std::cerr << " pre-processing exception: " << e.msg() << std::endl;
|
204 |
+
return;
|
205 |
+
}
|
206 |
+
|
207 |
+
auto input_to_net = ToInput(tensor); // input to the network
|
208 |
+
|
209 |
+
// inference
|
210 |
+
output;
|
211 |
+
try {
|
212 |
+
output = module.forward(input_to_net).toTensor(); // run inference
|
213 |
+
}
|
214 |
+
catch (const c10::Error& e)
|
215 |
+
{
|
216 |
+
std::cerr << " module.forward() exception: " << e.msg() << std::endl;
|
217 |
+
return;
|
218 |
+
}
|
219 |
+
|
220 |
+
output = output.detach().to(torch::kF32);
|
221 |
+
|
222 |
+
// move to CPU temporary
|
223 |
+
at::Tensor output_tmp = output;
|
224 |
+
output_tmp = output_tmp.to(torch::kCPU);
|
225 |
+
|
226 |
+
// normalization
|
227 |
+
float min_val = std::numeric_limits<float>::max();
|
228 |
+
float max_val = std::numeric_limits<float>::min();
|
229 |
+
|
230 |
+
for (int i = 0; i < net_width * net_height; ++i) {
|
231 |
+
float val = output_tmp.data_ptr<float>()[i];
|
232 |
+
if (min_val > val) min_val = val;
|
233 |
+
if (max_val < val) max_val = val;
|
234 |
+
}
|
235 |
+
float range_val = max_val - min_val;
|
236 |
+
|
237 |
+
output = output.sub(min_val).div(range_val).mul(255.0F).clamp(0, 255).to(torch::kF32); // .to(torch::kU8);
|
238 |
+
|
239 |
+
// resize to the original size if required
|
240 |
+
if (out_orig_size) {
|
241 |
+
try {
|
242 |
+
output = at::upsample_bilinear2d(output.unsqueeze(0), { cv_ptr->image.size().height, cv_ptr->image.size().width }, true);
|
243 |
+
output = output.squeeze(0);
|
244 |
+
}
|
245 |
+
catch (const c10::Error& e)
|
246 |
+
{
|
247 |
+
std::cout << " upsample_bilinear2d() exception: " << e.msg() << std::endl;
|
248 |
+
return;
|
249 |
+
}
|
250 |
+
}
|
251 |
+
output = output.permute({ 1, 2, 0 }).to(torch::kCPU);
|
252 |
+
|
253 |
+
int cv_type = CV_32FC1; // CV_8UC1;
|
254 |
+
auto cv_img = ToCvImage(output, cv_type);
|
255 |
+
|
256 |
+
sensor_msgs::Image img_msg;
|
257 |
+
|
258 |
+
try {
|
259 |
+
// cv::Mat -> sensor_msgs::Image
|
260 |
+
std_msgs::Header header; // empty header
|
261 |
+
header.seq = 0; // user defined counter
|
262 |
+
header.stamp = ros::Time::now();// time
|
263 |
+
//cv_bridge::CvImage img_bridge = cv_bridge::CvImage(header, sensor_msgs::image_encodings::MONO8, cv_img);
|
264 |
+
cv_bridge::CvImage img_bridge = cv_bridge::CvImage(header, sensor_msgs::image_encodings::TYPE_32FC1, cv_img);
|
265 |
+
|
266 |
+
img_bridge.toImageMsg(img_msg); // cv_bridge -> sensor_msgs::Image
|
267 |
+
}
|
268 |
+
catch (cv_bridge::Exception& e)
|
269 |
+
{
|
270 |
+
ROS_ERROR("cv_bridge exception: %s", e.what());
|
271 |
+
return;
|
272 |
+
}
|
273 |
+
|
274 |
+
// Output modified video stream
|
275 |
+
image_pub_.publish(img_msg);
|
276 |
+
}
|
277 |
+
};
|
278 |
+
|
279 |
+
int main(int argc, char** argv)
|
280 |
+
{
|
281 |
+
ros::init(argc, argv, "midas", ros::init_options::AnonymousName);
|
282 |
+
Midas ic;
|
283 |
+
ros::spin();
|
284 |
+
return 0;
|
285 |
+
}
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/run_talker_listener_test.sh
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# place any test.mp4 file near with this file
|
2 |
+
|
3 |
+
# roscore
|
4 |
+
# rosnode kill -a
|
5 |
+
|
6 |
+
source ~/catkin_ws/devel/setup.bash
|
7 |
+
|
8 |
+
roscore &
|
9 |
+
P1=$!
|
10 |
+
rosrun midas_cpp talker.py &
|
11 |
+
P2=$!
|
12 |
+
rosrun midas_cpp listener_original.py &
|
13 |
+
P3=$!
|
14 |
+
rosrun midas_cpp listener.py &
|
15 |
+
P4=$!
|
16 |
+
wait $P1 $P2 $P3 $P4
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/run.py
ADDED
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Compute depth maps for images in the input folder.
|
2 |
+
"""
|
3 |
+
import os
|
4 |
+
import glob
|
5 |
+
import torch
|
6 |
+
import utils
|
7 |
+
import cv2
|
8 |
+
import argparse
|
9 |
+
import time
|
10 |
+
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
from imutils.video import VideoStream
|
14 |
+
from midas.model_loader import default_models, load_model
|
15 |
+
|
16 |
+
first_execution = True
|
17 |
+
def process(device, model, model_type, image, input_size, target_size, optimize, use_camera):
|
18 |
+
"""
|
19 |
+
Run the inference and interpolate.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
device (torch.device): the torch device used
|
23 |
+
model: the model used for inference
|
24 |
+
model_type: the type of the model
|
25 |
+
image: the image fed into the neural network
|
26 |
+
input_size: the size (width, height) of the neural network input (for OpenVINO)
|
27 |
+
target_size: the size (width, height) the neural network output is interpolated to
|
28 |
+
optimize: optimize the model to half-floats on CUDA?
|
29 |
+
use_camera: is the camera used?
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
the prediction
|
33 |
+
"""
|
34 |
+
global first_execution
|
35 |
+
|
36 |
+
if "openvino" in model_type:
|
37 |
+
if first_execution or not use_camera:
|
38 |
+
print(f" Input resized to {input_size[0]}x{input_size[1]} before entering the encoder")
|
39 |
+
first_execution = False
|
40 |
+
|
41 |
+
sample = [np.reshape(image, (1, 3, *input_size))]
|
42 |
+
prediction = model(sample)[model.output(0)][0]
|
43 |
+
prediction = cv2.resize(prediction, dsize=target_size,
|
44 |
+
interpolation=cv2.INTER_CUBIC)
|
45 |
+
else:
|
46 |
+
sample = torch.from_numpy(image).to(device).unsqueeze(0)
|
47 |
+
|
48 |
+
if optimize and device == torch.device("cuda"):
|
49 |
+
if first_execution:
|
50 |
+
print(" Optimization to half-floats activated. Use with caution, because models like Swin require\n"
|
51 |
+
" float precision to work properly and may yield non-finite depth values to some extent for\n"
|
52 |
+
" half-floats.")
|
53 |
+
sample = sample.to(memory_format=torch.channels_last)
|
54 |
+
sample = sample.half()
|
55 |
+
|
56 |
+
if first_execution or not use_camera:
|
57 |
+
height, width = sample.shape[2:]
|
58 |
+
print(f" Input resized to {width}x{height} before entering the encoder")
|
59 |
+
first_execution = False
|
60 |
+
|
61 |
+
prediction = model.forward(sample)
|
62 |
+
prediction = (
|
63 |
+
torch.nn.functional.interpolate(
|
64 |
+
prediction.unsqueeze(1),
|
65 |
+
size=target_size[::-1],
|
66 |
+
mode="bicubic",
|
67 |
+
align_corners=False,
|
68 |
+
)
|
69 |
+
.squeeze()
|
70 |
+
.cpu()
|
71 |
+
.numpy()
|
72 |
+
)
|
73 |
+
|
74 |
+
return prediction
|
75 |
+
|
76 |
+
|
77 |
+
def create_side_by_side(image, depth, grayscale):
|
78 |
+
"""
|
79 |
+
Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth map
|
80 |
+
for better visibility.
|
81 |
+
|
82 |
+
Args:
|
83 |
+
image: the RGB image
|
84 |
+
depth: the depth map
|
85 |
+
grayscale: use a grayscale colormap?
|
86 |
+
|
87 |
+
Returns:
|
88 |
+
the image and depth map place side by side
|
89 |
+
"""
|
90 |
+
depth_min = depth.min()
|
91 |
+
depth_max = depth.max()
|
92 |
+
normalized_depth = 255 * (depth - depth_min) / (depth_max - depth_min)
|
93 |
+
normalized_depth *= 3
|
94 |
+
|
95 |
+
right_side = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
|
96 |
+
if not grayscale:
|
97 |
+
right_side = cv2.applyColorMap(np.uint8(right_side), cv2.COLORMAP_INFERNO)
|
98 |
+
|
99 |
+
if image is None:
|
100 |
+
return right_side
|
101 |
+
else:
|
102 |
+
return np.concatenate((image, right_side), axis=1)
|
103 |
+
|
104 |
+
|
105 |
+
def run(input_path, output_path, model_path, model_type="dpt_beit_large_512", optimize=False, side=False, height=None,
|
106 |
+
square=False, grayscale=False):
|
107 |
+
"""Run MonoDepthNN to compute depth maps.
|
108 |
+
|
109 |
+
Args:
|
110 |
+
input_path (str): path to input folder
|
111 |
+
output_path (str): path to output folder
|
112 |
+
model_path (str): path to saved model
|
113 |
+
model_type (str): the model type
|
114 |
+
optimize (bool): optimize the model to half-floats on CUDA?
|
115 |
+
side (bool): RGB and depth side by side in output images?
|
116 |
+
height (int): inference encoder image height
|
117 |
+
square (bool): resize to a square resolution?
|
118 |
+
grayscale (bool): use a grayscale colormap?
|
119 |
+
"""
|
120 |
+
print("Initialize")
|
121 |
+
|
122 |
+
# select device
|
123 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
124 |
+
print("Device: %s" % device)
|
125 |
+
|
126 |
+
model, transform, net_w, net_h = load_model(device, model_path, model_type, optimize, height, square)
|
127 |
+
|
128 |
+
# get input
|
129 |
+
if input_path is not None:
|
130 |
+
image_names = glob.glob(os.path.join(input_path, "*"))
|
131 |
+
num_images = len(image_names)
|
132 |
+
else:
|
133 |
+
print("No input path specified. Grabbing images from camera.")
|
134 |
+
|
135 |
+
# create output folder
|
136 |
+
if output_path is not None:
|
137 |
+
os.makedirs(output_path, exist_ok=True)
|
138 |
+
|
139 |
+
print("Start processing")
|
140 |
+
|
141 |
+
if input_path is not None:
|
142 |
+
if output_path is None:
|
143 |
+
print("Warning: No output path specified. Images will be processed but not shown or stored anywhere.")
|
144 |
+
for index, image_name in enumerate(image_names):
|
145 |
+
|
146 |
+
print(" Processing {} ({}/{})".format(image_name, index + 1, num_images))
|
147 |
+
|
148 |
+
# input
|
149 |
+
original_image_rgb = utils.read_image(image_name) # in [0, 1]
|
150 |
+
image = transform({"image": original_image_rgb})["image"]
|
151 |
+
|
152 |
+
# compute
|
153 |
+
with torch.no_grad():
|
154 |
+
prediction = process(device, model, model_type, image, (net_w, net_h), original_image_rgb.shape[1::-1],
|
155 |
+
optimize, False)
|
156 |
+
|
157 |
+
# output
|
158 |
+
if output_path is not None:
|
159 |
+
filename = os.path.join(
|
160 |
+
output_path, os.path.splitext(os.path.basename(image_name))[0] + '-' + model_type
|
161 |
+
)
|
162 |
+
if not side:
|
163 |
+
utils.write_depth(filename, prediction, grayscale, bits=2)
|
164 |
+
else:
|
165 |
+
original_image_bgr = np.flip(original_image_rgb, 2)
|
166 |
+
content = create_side_by_side(original_image_bgr*255, prediction, grayscale)
|
167 |
+
cv2.imwrite(filename + ".png", content)
|
168 |
+
utils.write_pfm(filename + ".pfm", prediction.astype(np.float32))
|
169 |
+
|
170 |
+
else:
|
171 |
+
with torch.no_grad():
|
172 |
+
fps = 1
|
173 |
+
video = VideoStream(0).start()
|
174 |
+
time_start = time.time()
|
175 |
+
frame_index = 0
|
176 |
+
while True:
|
177 |
+
frame = video.read()
|
178 |
+
if frame is not None:
|
179 |
+
original_image_rgb = np.flip(frame, 2) # in [0, 255] (flip required to get RGB)
|
180 |
+
image = transform({"image": original_image_rgb/255})["image"]
|
181 |
+
|
182 |
+
prediction = process(device, model, model_type, image, (net_w, net_h),
|
183 |
+
original_image_rgb.shape[1::-1], optimize, True)
|
184 |
+
|
185 |
+
original_image_bgr = np.flip(original_image_rgb, 2) if side else None
|
186 |
+
content = create_side_by_side(original_image_bgr, prediction, grayscale)
|
187 |
+
cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', content/255)
|
188 |
+
|
189 |
+
if output_path is not None:
|
190 |
+
filename = os.path.join(output_path, 'Camera' + '-' + model_type + '_' + str(frame_index))
|
191 |
+
cv2.imwrite(filename + ".png", content)
|
192 |
+
|
193 |
+
alpha = 0.1
|
194 |
+
if time.time()-time_start > 0:
|
195 |
+
fps = (1 - alpha) * fps + alpha * 1 / (time.time()-time_start) # exponential moving average
|
196 |
+
time_start = time.time()
|
197 |
+
print(f"\rFPS: {round(fps,2)}", end="")
|
198 |
+
|
199 |
+
if cv2.waitKey(1) == 27: # Escape key
|
200 |
+
break
|
201 |
+
|
202 |
+
frame_index += 1
|
203 |
+
print()
|
204 |
+
|
205 |
+
print("Finished")
|
206 |
+
|
207 |
+
|
208 |
+
if __name__ == "__main__":
|
209 |
+
parser = argparse.ArgumentParser()
|
210 |
+
|
211 |
+
parser.add_argument('-i', '--input_path',
|
212 |
+
default=None,
|
213 |
+
help='Folder with input images (if no input path is specified, images are tried to be grabbed '
|
214 |
+
'from camera)'
|
215 |
+
)
|
216 |
+
|
217 |
+
parser.add_argument('-o', '--output_path',
|
218 |
+
default=None,
|
219 |
+
help='Folder for output images'
|
220 |
+
)
|
221 |
+
|
222 |
+
parser.add_argument('-m', '--model_weights',
|
223 |
+
default=None,
|
224 |
+
help='Path to the trained weights of model'
|
225 |
+
)
|
226 |
+
|
227 |
+
parser.add_argument('-t', '--model_type',
|
228 |
+
default='dpt_beit_large_512',
|
229 |
+
help='Model type: '
|
230 |
+
'dpt_beit_large_512, dpt_beit_large_384, dpt_beit_base_384, dpt_swin2_large_384, '
|
231 |
+
'dpt_swin2_base_384, dpt_swin2_tiny_256, dpt_swin_large_384, dpt_next_vit_large_384, '
|
232 |
+
'dpt_levit_224, dpt_large_384, dpt_hybrid_384, midas_v21_384, midas_v21_small_256 or '
|
233 |
+
'openvino_midas_v21_small_256'
|
234 |
+
)
|
235 |
+
|
236 |
+
parser.add_argument('-s', '--side',
|
237 |
+
action='store_true',
|
238 |
+
help='Output images contain RGB and depth images side by side'
|
239 |
+
)
|
240 |
+
|
241 |
+
parser.add_argument('--optimize', dest='optimize', action='store_true', help='Use half-float optimization')
|
242 |
+
parser.set_defaults(optimize=False)
|
243 |
+
|
244 |
+
parser.add_argument('--height',
|
245 |
+
type=int, default=None,
|
246 |
+
help='Preferred height of images feed into the encoder during inference. Note that the '
|
247 |
+
'preferred height may differ from the actual height, because an alignment to multiples of '
|
248 |
+
'32 takes place. Many models support only the height chosen during training, which is '
|
249 |
+
'used automatically if this parameter is not set.'
|
250 |
+
)
|
251 |
+
parser.add_argument('--square',
|
252 |
+
action='store_true',
|
253 |
+
help='Option to resize images to a square resolution by changing their widths when images are '
|
254 |
+
'fed into the encoder during inference. If this parameter is not set, the aspect ratio of '
|
255 |
+
'images is tried to be preserved if supported by the model.'
|
256 |
+
)
|
257 |
+
parser.add_argument('--grayscale',
|
258 |
+
action='store_true',
|
259 |
+
help='Use a grayscale colormap instead of the inferno one. Although the inferno colormap, '
|
260 |
+
'which is used by default, is better for visibility, it does not allow storing 16-bit '
|
261 |
+
'depth values in PNGs but only 8-bit ones due to the precision limitation of this '
|
262 |
+
'colormap.'
|
263 |
+
)
|
264 |
+
|
265 |
+
args = parser.parse_args()
|
266 |
+
|
267 |
+
|
268 |
+
if args.model_weights is None:
|
269 |
+
args.model_weights = default_models[args.model_type]
|
270 |
+
|
271 |
+
# set torch options
|
272 |
+
torch.backends.cudnn.enabled = True
|
273 |
+
torch.backends.cudnn.benchmark = True
|
274 |
+
|
275 |
+
# compute depth maps
|
276 |
+
run(args.input_path, args.output_path, args.model_weights, args.model_type, args.optimize, args.side, args.height,
|
277 |
+
args.square, args.grayscale)
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/README.md
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer
|
2 |
+
|
3 |
+
### TensorFlow inference using `.pb` and `.onnx` models
|
4 |
+
|
5 |
+
1. [Run inference on TensorFlow-model by using TensorFlow](#run-inference-on-tensorflow-model-by-using-tensorFlow)
|
6 |
+
|
7 |
+
2. [Run inference on ONNX-model by using TensorFlow](#run-inference-on-onnx-model-by-using-tensorflow)
|
8 |
+
|
9 |
+
3. [Make ONNX model from downloaded Pytorch model file](#make-onnx-model-from-downloaded-pytorch-model-file)
|
10 |
+
|
11 |
+
|
12 |
+
### Run inference on TensorFlow-model by using TensorFlow
|
13 |
+
|
14 |
+
1) Download the model weights [model-f6b98070.pb](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.pb)
|
15 |
+
and [model-small.pb](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small.pb) and place the
|
16 |
+
file in the `/tf/` folder.
|
17 |
+
|
18 |
+
2) Set up dependencies:
|
19 |
+
|
20 |
+
```shell
|
21 |
+
# install OpenCV
|
22 |
+
pip install --upgrade pip
|
23 |
+
pip install opencv-python
|
24 |
+
|
25 |
+
# install TensorFlow
|
26 |
+
pip install -I grpcio tensorflow==2.3.0 tensorflow-addons==0.11.2 numpy==1.18.0
|
27 |
+
```
|
28 |
+
|
29 |
+
#### Usage
|
30 |
+
|
31 |
+
1) Place one or more input images in the folder `tf/input`.
|
32 |
+
|
33 |
+
2) Run the model:
|
34 |
+
|
35 |
+
```shell
|
36 |
+
python tf/run_pb.py
|
37 |
+
```
|
38 |
+
|
39 |
+
Or run the small model:
|
40 |
+
|
41 |
+
```shell
|
42 |
+
python tf/run_pb.py --model_weights model-small.pb --model_type small
|
43 |
+
```
|
44 |
+
|
45 |
+
3) The resulting inverse depth maps are written to the `tf/output` folder.
|
46 |
+
|
47 |
+
|
48 |
+
### Run inference on ONNX-model by using ONNX-Runtime
|
49 |
+
|
50 |
+
1) Download the model weights [model-f6b98070.onnx](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.onnx)
|
51 |
+
and [model-small.onnx](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small.onnx) and place the
|
52 |
+
file in the `/tf/` folder.
|
53 |
+
|
54 |
+
2) Set up dependencies:
|
55 |
+
|
56 |
+
```shell
|
57 |
+
# install OpenCV
|
58 |
+
pip install --upgrade pip
|
59 |
+
pip install opencv-python
|
60 |
+
|
61 |
+
# install ONNX
|
62 |
+
pip install onnx==1.7.0
|
63 |
+
|
64 |
+
# install ONNX Runtime
|
65 |
+
pip install onnxruntime==1.5.2
|
66 |
+
```
|
67 |
+
|
68 |
+
#### Usage
|
69 |
+
|
70 |
+
1) Place one or more input images in the folder `tf/input`.
|
71 |
+
|
72 |
+
2) Run the model:
|
73 |
+
|
74 |
+
```shell
|
75 |
+
python tf/run_onnx.py
|
76 |
+
```
|
77 |
+
|
78 |
+
Or run the small model:
|
79 |
+
|
80 |
+
```shell
|
81 |
+
python tf/run_onnx.py --model_weights model-small.onnx --model_type small
|
82 |
+
```
|
83 |
+
|
84 |
+
3) The resulting inverse depth maps are written to the `tf/output` folder.
|
85 |
+
|
86 |
+
|
87 |
+
|
88 |
+
### Make ONNX model from downloaded Pytorch model file
|
89 |
+
|
90 |
+
1) Download the model weights [model-f6b98070.pt](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.pt) and place the
|
91 |
+
file in the root folder.
|
92 |
+
|
93 |
+
2) Set up dependencies:
|
94 |
+
|
95 |
+
```shell
|
96 |
+
# install OpenCV
|
97 |
+
pip install --upgrade pip
|
98 |
+
pip install opencv-python
|
99 |
+
|
100 |
+
# install PyTorch TorchVision
|
101 |
+
pip install -I torch==1.7.0 torchvision==0.8.0
|
102 |
+
|
103 |
+
# install TensorFlow
|
104 |
+
pip install -I grpcio tensorflow==2.3.0 tensorflow-addons==0.11.2 numpy==1.18.0
|
105 |
+
|
106 |
+
# install ONNX
|
107 |
+
pip install onnx==1.7.0
|
108 |
+
|
109 |
+
# install ONNX-TensorFlow
|
110 |
+
git clone https://github.com/onnx/onnx-tensorflow.git
|
111 |
+
cd onnx-tensorflow
|
112 |
+
git checkout 095b51b88e35c4001d70f15f80f31014b592b81e
|
113 |
+
pip install -e .
|
114 |
+
```
|
115 |
+
|
116 |
+
#### Usage
|
117 |
+
|
118 |
+
1) Run the converter:
|
119 |
+
|
120 |
+
```shell
|
121 |
+
python tf/make_onnx_model.py
|
122 |
+
```
|
123 |
+
|
124 |
+
2) The resulting `model-f6b98070.onnx` file is written to the `/tf/` folder.
|
125 |
+
|
126 |
+
|
127 |
+
### Requirements
|
128 |
+
|
129 |
+
The code was tested with Python 3.6.9, PyTorch 1.5.1, TensorFlow 2.2.0, TensorFlow-addons 0.8.3, ONNX 1.7.0, ONNX-TensorFlow (GitHub-master-17.07.2020) and OpenCV 4.3.0.
|
130 |
+
|
131 |
+
### Citation
|
132 |
+
|
133 |
+
Please cite our paper if you use this code or any of the models:
|
134 |
+
```
|
135 |
+
@article{Ranftl2019,
|
136 |
+
author = {Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun},
|
137 |
+
title = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer},
|
138 |
+
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
|
139 |
+
year = {2020},
|
140 |
+
}
|
141 |
+
```
|
142 |
+
|
143 |
+
### License
|
144 |
+
|
145 |
+
MIT License
|
146 |
+
|
147 |
+
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/input/.placeholder
ADDED
File without changes
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/make_onnx_model.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Compute depth maps for images in the input folder.
|
2 |
+
"""
|
3 |
+
import os
|
4 |
+
import ntpath
|
5 |
+
import glob
|
6 |
+
import torch
|
7 |
+
import utils
|
8 |
+
import cv2
|
9 |
+
import numpy as np
|
10 |
+
from torchvision.transforms import Compose, Normalize
|
11 |
+
from torchvision import transforms
|
12 |
+
|
13 |
+
from shutil import copyfile
|
14 |
+
import fileinput
|
15 |
+
import sys
|
16 |
+
sys.path.append(os.getcwd() + '/..')
|
17 |
+
|
18 |
+
def modify_file():
|
19 |
+
modify_filename = '../midas/blocks.py'
|
20 |
+
copyfile(modify_filename, modify_filename+'.bak')
|
21 |
+
|
22 |
+
with open(modify_filename, 'r') as file :
|
23 |
+
filedata = file.read()
|
24 |
+
|
25 |
+
filedata = filedata.replace('align_corners=True', 'align_corners=False')
|
26 |
+
filedata = filedata.replace('import torch.nn as nn', 'import torch.nn as nn\nimport torchvision.models as models')
|
27 |
+
filedata = filedata.replace('torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")', 'models.resnext101_32x8d()')
|
28 |
+
|
29 |
+
with open(modify_filename, 'w') as file:
|
30 |
+
file.write(filedata)
|
31 |
+
|
32 |
+
def restore_file():
|
33 |
+
modify_filename = '../midas/blocks.py'
|
34 |
+
copyfile(modify_filename+'.bak', modify_filename)
|
35 |
+
|
36 |
+
modify_file()
|
37 |
+
|
38 |
+
from midas.midas_net import MidasNet
|
39 |
+
from midas.transforms import Resize, NormalizeImage, PrepareForNet
|
40 |
+
|
41 |
+
restore_file()
|
42 |
+
|
43 |
+
|
44 |
+
class MidasNet_preprocessing(MidasNet):
|
45 |
+
"""Network for monocular depth estimation.
|
46 |
+
"""
|
47 |
+
def forward(self, x):
|
48 |
+
"""Forward pass.
|
49 |
+
|
50 |
+
Args:
|
51 |
+
x (tensor): input data (image)
|
52 |
+
|
53 |
+
Returns:
|
54 |
+
tensor: depth
|
55 |
+
"""
|
56 |
+
|
57 |
+
mean = torch.tensor([0.485, 0.456, 0.406])
|
58 |
+
std = torch.tensor([0.229, 0.224, 0.225])
|
59 |
+
x.sub_(mean[None, :, None, None]).div_(std[None, :, None, None])
|
60 |
+
|
61 |
+
return MidasNet.forward(self, x)
|
62 |
+
|
63 |
+
|
64 |
+
def run(model_path):
|
65 |
+
"""Run MonoDepthNN to compute depth maps.
|
66 |
+
|
67 |
+
Args:
|
68 |
+
model_path (str): path to saved model
|
69 |
+
"""
|
70 |
+
print("initialize")
|
71 |
+
|
72 |
+
# select device
|
73 |
+
|
74 |
+
# load network
|
75 |
+
#model = MidasNet(model_path, non_negative=True)
|
76 |
+
model = MidasNet_preprocessing(model_path, non_negative=True)
|
77 |
+
|
78 |
+
model.eval()
|
79 |
+
|
80 |
+
print("start processing")
|
81 |
+
|
82 |
+
# input
|
83 |
+
img_input = np.zeros((3, 384, 384), np.float32)
|
84 |
+
|
85 |
+
# compute
|
86 |
+
with torch.no_grad():
|
87 |
+
sample = torch.from_numpy(img_input).unsqueeze(0)
|
88 |
+
prediction = model.forward(sample)
|
89 |
+
prediction = (
|
90 |
+
torch.nn.functional.interpolate(
|
91 |
+
prediction.unsqueeze(1),
|
92 |
+
size=img_input.shape[:2],
|
93 |
+
mode="bicubic",
|
94 |
+
align_corners=False,
|
95 |
+
)
|
96 |
+
.squeeze()
|
97 |
+
.cpu()
|
98 |
+
.numpy()
|
99 |
+
)
|
100 |
+
|
101 |
+
torch.onnx.export(model, sample, ntpath.basename(model_path).rsplit('.', 1)[0]+'.onnx', opset_version=9)
|
102 |
+
|
103 |
+
print("finished")
|
104 |
+
|
105 |
+
|
106 |
+
if __name__ == "__main__":
|
107 |
+
# set paths
|
108 |
+
# MODEL_PATH = "model.pt"
|
109 |
+
MODEL_PATH = "../model-f6b98070.pt"
|
110 |
+
|
111 |
+
# compute depth maps
|
112 |
+
run(MODEL_PATH)
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/output/.placeholder
ADDED
File without changes
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_onnx.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Compute depth maps for images in the input folder.
|
2 |
+
"""
|
3 |
+
import os
|
4 |
+
import glob
|
5 |
+
import utils
|
6 |
+
import cv2
|
7 |
+
import sys
|
8 |
+
import numpy as np
|
9 |
+
import argparse
|
10 |
+
|
11 |
+
import onnx
|
12 |
+
import onnxruntime as rt
|
13 |
+
|
14 |
+
from transforms import Resize, NormalizeImage, PrepareForNet
|
15 |
+
|
16 |
+
|
17 |
+
def run(input_path, output_path, model_path, model_type="large"):
|
18 |
+
"""Run MonoDepthNN to compute depth maps.
|
19 |
+
|
20 |
+
Args:
|
21 |
+
input_path (str): path to input folder
|
22 |
+
output_path (str): path to output folder
|
23 |
+
model_path (str): path to saved model
|
24 |
+
"""
|
25 |
+
print("initialize")
|
26 |
+
|
27 |
+
# select device
|
28 |
+
device = "CUDA:0"
|
29 |
+
#device = "CPU"
|
30 |
+
print("device: %s" % device)
|
31 |
+
|
32 |
+
# network resolution
|
33 |
+
if model_type == "large":
|
34 |
+
net_w, net_h = 384, 384
|
35 |
+
elif model_type == "small":
|
36 |
+
net_w, net_h = 256, 256
|
37 |
+
else:
|
38 |
+
print(f"model_type '{model_type}' not implemented, use: --model_type large")
|
39 |
+
assert False
|
40 |
+
|
41 |
+
# load network
|
42 |
+
print("loading model...")
|
43 |
+
model = rt.InferenceSession(model_path)
|
44 |
+
input_name = model.get_inputs()[0].name
|
45 |
+
output_name = model.get_outputs()[0].name
|
46 |
+
|
47 |
+
resize_image = Resize(
|
48 |
+
net_w,
|
49 |
+
net_h,
|
50 |
+
resize_target=None,
|
51 |
+
keep_aspect_ratio=False,
|
52 |
+
ensure_multiple_of=32,
|
53 |
+
resize_method="upper_bound",
|
54 |
+
image_interpolation_method=cv2.INTER_CUBIC,
|
55 |
+
)
|
56 |
+
|
57 |
+
def compose2(f1, f2):
|
58 |
+
return lambda x: f2(f1(x))
|
59 |
+
|
60 |
+
transform = compose2(resize_image, PrepareForNet())
|
61 |
+
|
62 |
+
# get input
|
63 |
+
img_names = glob.glob(os.path.join(input_path, "*"))
|
64 |
+
num_images = len(img_names)
|
65 |
+
|
66 |
+
# create output folder
|
67 |
+
os.makedirs(output_path, exist_ok=True)
|
68 |
+
|
69 |
+
print("start processing")
|
70 |
+
|
71 |
+
for ind, img_name in enumerate(img_names):
|
72 |
+
|
73 |
+
print(" processing {} ({}/{})".format(img_name, ind + 1, num_images))
|
74 |
+
|
75 |
+
# input
|
76 |
+
img = utils.read_image(img_name)
|
77 |
+
img_input = transform({"image": img})["image"]
|
78 |
+
|
79 |
+
# compute
|
80 |
+
output = model.run([output_name], {input_name: img_input.reshape(1, 3, net_h, net_w).astype(np.float32)})[0]
|
81 |
+
prediction = np.array(output).reshape(net_h, net_w)
|
82 |
+
prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC)
|
83 |
+
|
84 |
+
# output
|
85 |
+
filename = os.path.join(
|
86 |
+
output_path, os.path.splitext(os.path.basename(img_name))[0]
|
87 |
+
)
|
88 |
+
utils.write_depth(filename, prediction, bits=2)
|
89 |
+
|
90 |
+
print("finished")
|
91 |
+
|
92 |
+
|
93 |
+
if __name__ == "__main__":
|
94 |
+
parser = argparse.ArgumentParser()
|
95 |
+
|
96 |
+
parser.add_argument('-i', '--input_path',
|
97 |
+
default='input',
|
98 |
+
help='folder with input images'
|
99 |
+
)
|
100 |
+
|
101 |
+
parser.add_argument('-o', '--output_path',
|
102 |
+
default='output',
|
103 |
+
help='folder for output images'
|
104 |
+
)
|
105 |
+
|
106 |
+
parser.add_argument('-m', '--model_weights',
|
107 |
+
default='model-f6b98070.onnx',
|
108 |
+
help='path to the trained weights of model'
|
109 |
+
)
|
110 |
+
|
111 |
+
parser.add_argument('-t', '--model_type',
|
112 |
+
default='large',
|
113 |
+
help='model type: large or small'
|
114 |
+
)
|
115 |
+
|
116 |
+
args = parser.parse_args()
|
117 |
+
|
118 |
+
# compute depth maps
|
119 |
+
run(args.input_path, args.output_path, args.model_weights, args.model_type)
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_pb.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Compute depth maps for images in the input folder.
|
2 |
+
"""
|
3 |
+
import os
|
4 |
+
import glob
|
5 |
+
import utils
|
6 |
+
import cv2
|
7 |
+
import argparse
|
8 |
+
|
9 |
+
import tensorflow as tf
|
10 |
+
|
11 |
+
from transforms import Resize, NormalizeImage, PrepareForNet
|
12 |
+
|
13 |
+
def run(input_path, output_path, model_path, model_type="large"):
|
14 |
+
"""Run MonoDepthNN to compute depth maps.
|
15 |
+
|
16 |
+
Args:
|
17 |
+
input_path (str): path to input folder
|
18 |
+
output_path (str): path to output folder
|
19 |
+
model_path (str): path to saved model
|
20 |
+
"""
|
21 |
+
print("initialize")
|
22 |
+
|
23 |
+
# the runtime initialization will not allocate all memory on the device to avoid out of GPU memory
|
24 |
+
gpus = tf.config.experimental.list_physical_devices('GPU')
|
25 |
+
if gpus:
|
26 |
+
try:
|
27 |
+
for gpu in gpus:
|
28 |
+
#tf.config.experimental.set_memory_growth(gpu, True)
|
29 |
+
tf.config.experimental.set_virtual_device_configuration(gpu,
|
30 |
+
[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
|
31 |
+
except RuntimeError as e:
|
32 |
+
print(e)
|
33 |
+
|
34 |
+
# network resolution
|
35 |
+
if model_type == "large":
|
36 |
+
net_w, net_h = 384, 384
|
37 |
+
elif model_type == "small":
|
38 |
+
net_w, net_h = 256, 256
|
39 |
+
else:
|
40 |
+
print(f"model_type '{model_type}' not implemented, use: --model_type large")
|
41 |
+
assert False
|
42 |
+
|
43 |
+
# load network
|
44 |
+
graph_def = tf.compat.v1.GraphDef()
|
45 |
+
with tf.io.gfile.GFile(model_path, 'rb') as f:
|
46 |
+
graph_def.ParseFromString(f.read())
|
47 |
+
tf.import_graph_def(graph_def, name='')
|
48 |
+
|
49 |
+
|
50 |
+
model_operations = tf.compat.v1.get_default_graph().get_operations()
|
51 |
+
input_node = '0:0'
|
52 |
+
output_layer = model_operations[len(model_operations) - 1].name + ':0'
|
53 |
+
print("Last layer name: ", output_layer)
|
54 |
+
|
55 |
+
resize_image = Resize(
|
56 |
+
net_w,
|
57 |
+
net_h,
|
58 |
+
resize_target=None,
|
59 |
+
keep_aspect_ratio=False,
|
60 |
+
ensure_multiple_of=32,
|
61 |
+
resize_method="upper_bound",
|
62 |
+
image_interpolation_method=cv2.INTER_CUBIC,
|
63 |
+
)
|
64 |
+
|
65 |
+
def compose2(f1, f2):
|
66 |
+
return lambda x: f2(f1(x))
|
67 |
+
|
68 |
+
transform = compose2(resize_image, PrepareForNet())
|
69 |
+
|
70 |
+
# get input
|
71 |
+
img_names = glob.glob(os.path.join(input_path, "*"))
|
72 |
+
num_images = len(img_names)
|
73 |
+
|
74 |
+
# create output folder
|
75 |
+
os.makedirs(output_path, exist_ok=True)
|
76 |
+
|
77 |
+
print("start processing")
|
78 |
+
|
79 |
+
with tf.compat.v1.Session() as sess:
|
80 |
+
try:
|
81 |
+
# load images
|
82 |
+
for ind, img_name in enumerate(img_names):
|
83 |
+
|
84 |
+
print(" processing {} ({}/{})".format(img_name, ind + 1, num_images))
|
85 |
+
|
86 |
+
# input
|
87 |
+
img = utils.read_image(img_name)
|
88 |
+
img_input = transform({"image": img})["image"]
|
89 |
+
|
90 |
+
# compute
|
91 |
+
prob_tensor = sess.graph.get_tensor_by_name(output_layer)
|
92 |
+
prediction, = sess.run(prob_tensor, {input_node: [img_input] })
|
93 |
+
prediction = prediction.reshape(net_h, net_w)
|
94 |
+
prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC)
|
95 |
+
|
96 |
+
# output
|
97 |
+
filename = os.path.join(
|
98 |
+
output_path, os.path.splitext(os.path.basename(img_name))[0]
|
99 |
+
)
|
100 |
+
utils.write_depth(filename, prediction, bits=2)
|
101 |
+
|
102 |
+
except KeyError:
|
103 |
+
print ("Couldn't find input node: ' + input_node + ' or output layer: " + output_layer + ".")
|
104 |
+
exit(-1)
|
105 |
+
|
106 |
+
print("finished")
|
107 |
+
|
108 |
+
|
109 |
+
if __name__ == "__main__":
|
110 |
+
parser = argparse.ArgumentParser()
|
111 |
+
|
112 |
+
parser.add_argument('-i', '--input_path',
|
113 |
+
default='input',
|
114 |
+
help='folder with input images'
|
115 |
+
)
|
116 |
+
|
117 |
+
parser.add_argument('-o', '--output_path',
|
118 |
+
default='output',
|
119 |
+
help='folder for output images'
|
120 |
+
)
|
121 |
+
|
122 |
+
parser.add_argument('-m', '--model_weights',
|
123 |
+
default='model-f6b98070.pb',
|
124 |
+
help='path to the trained weights of model'
|
125 |
+
)
|
126 |
+
|
127 |
+
parser.add_argument('-t', '--model_type',
|
128 |
+
default='large',
|
129 |
+
help='model type: large or small'
|
130 |
+
)
|
131 |
+
|
132 |
+
args = parser.parse_args()
|
133 |
+
|
134 |
+
# compute depth maps
|
135 |
+
run(args.input_path, args.output_path, args.model_weights, args.model_type)
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/transforms.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import cv2
|
3 |
+
import math
|
4 |
+
|
5 |
+
|
6 |
+
def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
|
7 |
+
"""Rezise the sample to ensure the given size. Keeps aspect ratio.
|
8 |
+
|
9 |
+
Args:
|
10 |
+
sample (dict): sample
|
11 |
+
size (tuple): image size
|
12 |
+
|
13 |
+
Returns:
|
14 |
+
tuple: new size
|
15 |
+
"""
|
16 |
+
shape = list(sample["disparity"].shape)
|
17 |
+
|
18 |
+
if shape[0] >= size[0] and shape[1] >= size[1]:
|
19 |
+
return sample
|
20 |
+
|
21 |
+
scale = [0, 0]
|
22 |
+
scale[0] = size[0] / shape[0]
|
23 |
+
scale[1] = size[1] / shape[1]
|
24 |
+
|
25 |
+
scale = max(scale)
|
26 |
+
|
27 |
+
shape[0] = math.ceil(scale * shape[0])
|
28 |
+
shape[1] = math.ceil(scale * shape[1])
|
29 |
+
|
30 |
+
# resize
|
31 |
+
sample["image"] = cv2.resize(
|
32 |
+
sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
|
33 |
+
)
|
34 |
+
|
35 |
+
sample["disparity"] = cv2.resize(
|
36 |
+
sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
|
37 |
+
)
|
38 |
+
sample["mask"] = cv2.resize(
|
39 |
+
sample["mask"].astype(np.float32),
|
40 |
+
tuple(shape[::-1]),
|
41 |
+
interpolation=cv2.INTER_NEAREST,
|
42 |
+
)
|
43 |
+
sample["mask"] = sample["mask"].astype(bool)
|
44 |
+
|
45 |
+
return tuple(shape)
|
46 |
+
|
47 |
+
|
48 |
+
class Resize(object):
|
49 |
+
"""Resize sample to given size (width, height).
|
50 |
+
"""
|
51 |
+
|
52 |
+
def __init__(
|
53 |
+
self,
|
54 |
+
width,
|
55 |
+
height,
|
56 |
+
resize_target=True,
|
57 |
+
keep_aspect_ratio=False,
|
58 |
+
ensure_multiple_of=1,
|
59 |
+
resize_method="lower_bound",
|
60 |
+
image_interpolation_method=cv2.INTER_AREA,
|
61 |
+
):
|
62 |
+
"""Init.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
width (int): desired output width
|
66 |
+
height (int): desired output height
|
67 |
+
resize_target (bool, optional):
|
68 |
+
True: Resize the full sample (image, mask, target).
|
69 |
+
False: Resize image only.
|
70 |
+
Defaults to True.
|
71 |
+
keep_aspect_ratio (bool, optional):
|
72 |
+
True: Keep the aspect ratio of the input sample.
|
73 |
+
Output sample might not have the given width and height, and
|
74 |
+
resize behaviour depends on the parameter 'resize_method'.
|
75 |
+
Defaults to False.
|
76 |
+
ensure_multiple_of (int, optional):
|
77 |
+
Output width and height is constrained to be multiple of this parameter.
|
78 |
+
Defaults to 1.
|
79 |
+
resize_method (str, optional):
|
80 |
+
"lower_bound": Output will be at least as large as the given size.
|
81 |
+
"upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
|
82 |
+
"minimal": Scale as least as possible. (Output size might be smaller than given size.)
|
83 |
+
Defaults to "lower_bound".
|
84 |
+
"""
|
85 |
+
self.__width = width
|
86 |
+
self.__height = height
|
87 |
+
|
88 |
+
self.__resize_target = resize_target
|
89 |
+
self.__keep_aspect_ratio = keep_aspect_ratio
|
90 |
+
self.__multiple_of = ensure_multiple_of
|
91 |
+
self.__resize_method = resize_method
|
92 |
+
self.__image_interpolation_method = image_interpolation_method
|
93 |
+
|
94 |
+
def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
|
95 |
+
y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
96 |
+
|
97 |
+
if max_val is not None and y > max_val:
|
98 |
+
y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
99 |
+
|
100 |
+
if y < min_val:
|
101 |
+
y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
102 |
+
|
103 |
+
return y
|
104 |
+
|
105 |
+
def get_size(self, width, height):
|
106 |
+
# determine new height and width
|
107 |
+
scale_height = self.__height / height
|
108 |
+
scale_width = self.__width / width
|
109 |
+
|
110 |
+
if self.__keep_aspect_ratio:
|
111 |
+
if self.__resize_method == "lower_bound":
|
112 |
+
# scale such that output size is lower bound
|
113 |
+
if scale_width > scale_height:
|
114 |
+
# fit width
|
115 |
+
scale_height = scale_width
|
116 |
+
else:
|
117 |
+
# fit height
|
118 |
+
scale_width = scale_height
|
119 |
+
elif self.__resize_method == "upper_bound":
|
120 |
+
# scale such that output size is upper bound
|
121 |
+
if scale_width < scale_height:
|
122 |
+
# fit width
|
123 |
+
scale_height = scale_width
|
124 |
+
else:
|
125 |
+
# fit height
|
126 |
+
scale_width = scale_height
|
127 |
+
elif self.__resize_method == "minimal":
|
128 |
+
# scale as least as possbile
|
129 |
+
if abs(1 - scale_width) < abs(1 - scale_height):
|
130 |
+
# fit width
|
131 |
+
scale_height = scale_width
|
132 |
+
else:
|
133 |
+
# fit height
|
134 |
+
scale_width = scale_height
|
135 |
+
else:
|
136 |
+
raise ValueError(
|
137 |
+
f"resize_method {self.__resize_method} not implemented"
|
138 |
+
)
|
139 |
+
|
140 |
+
if self.__resize_method == "lower_bound":
|
141 |
+
new_height = self.constrain_to_multiple_of(
|
142 |
+
scale_height * height, min_val=self.__height
|
143 |
+
)
|
144 |
+
new_width = self.constrain_to_multiple_of(
|
145 |
+
scale_width * width, min_val=self.__width
|
146 |
+
)
|
147 |
+
elif self.__resize_method == "upper_bound":
|
148 |
+
new_height = self.constrain_to_multiple_of(
|
149 |
+
scale_height * height, max_val=self.__height
|
150 |
+
)
|
151 |
+
new_width = self.constrain_to_multiple_of(
|
152 |
+
scale_width * width, max_val=self.__width
|
153 |
+
)
|
154 |
+
elif self.__resize_method == "minimal":
|
155 |
+
new_height = self.constrain_to_multiple_of(scale_height * height)
|
156 |
+
new_width = self.constrain_to_multiple_of(scale_width * width)
|
157 |
+
else:
|
158 |
+
raise ValueError(f"resize_method {self.__resize_method} not implemented")
|
159 |
+
|
160 |
+
return (new_width, new_height)
|
161 |
+
|
162 |
+
def __call__(self, sample):
|
163 |
+
width, height = self.get_size(
|
164 |
+
sample["image"].shape[1], sample["image"].shape[0]
|
165 |
+
)
|
166 |
+
|
167 |
+
# resize sample
|
168 |
+
sample["image"] = cv2.resize(
|
169 |
+
sample["image"],
|
170 |
+
(width, height),
|
171 |
+
interpolation=self.__image_interpolation_method,
|
172 |
+
)
|
173 |
+
|
174 |
+
if self.__resize_target:
|
175 |
+
if "disparity" in sample:
|
176 |
+
sample["disparity"] = cv2.resize(
|
177 |
+
sample["disparity"],
|
178 |
+
(width, height),
|
179 |
+
interpolation=cv2.INTER_NEAREST,
|
180 |
+
)
|
181 |
+
|
182 |
+
if "depth" in sample:
|
183 |
+
sample["depth"] = cv2.resize(
|
184 |
+
sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
|
185 |
+
)
|
186 |
+
|
187 |
+
sample["mask"] = cv2.resize(
|
188 |
+
sample["mask"].astype(np.float32),
|
189 |
+
(width, height),
|
190 |
+
interpolation=cv2.INTER_NEAREST,
|
191 |
+
)
|
192 |
+
sample["mask"] = sample["mask"].astype(bool)
|
193 |
+
|
194 |
+
return sample
|
195 |
+
|
196 |
+
|
197 |
+
class NormalizeImage(object):
|
198 |
+
"""Normlize image by given mean and std.
|
199 |
+
"""
|
200 |
+
|
201 |
+
def __init__(self, mean, std):
|
202 |
+
self.__mean = mean
|
203 |
+
self.__std = std
|
204 |
+
|
205 |
+
def __call__(self, sample):
|
206 |
+
sample["image"] = (sample["image"] - self.__mean) / self.__std
|
207 |
+
|
208 |
+
return sample
|
209 |
+
|
210 |
+
|
211 |
+
class PrepareForNet(object):
|
212 |
+
"""Prepare sample for usage as network input.
|
213 |
+
"""
|
214 |
+
|
215 |
+
def __init__(self):
|
216 |
+
pass
|
217 |
+
|
218 |
+
def __call__(self, sample):
|
219 |
+
image = np.transpose(sample["image"], (2, 0, 1))
|
220 |
+
sample["image"] = np.ascontiguousarray(image).astype(np.float32)
|
221 |
+
|
222 |
+
if "mask" in sample:
|
223 |
+
sample["mask"] = sample["mask"].astype(np.float32)
|
224 |
+
sample["mask"] = np.ascontiguousarray(sample["mask"])
|
225 |
+
|
226 |
+
if "disparity" in sample:
|
227 |
+
disparity = sample["disparity"].astype(np.float32)
|
228 |
+
sample["disparity"] = np.ascontiguousarray(disparity)
|
229 |
+
|
230 |
+
if "depth" in sample:
|
231 |
+
depth = sample["depth"].astype(np.float32)
|
232 |
+
sample["depth"] = np.ascontiguousarray(depth)
|
233 |
+
|
234 |
+
return sample
|
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/utils.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np
import sys
import cv2


def write_pfm(path, image, scale=1):
    """Write pfm file.
    Args:
        path (str): path to file
        image (array): data
        scale (int, optional): Scale. Defaults to 1.
    """

    with open(path, "wb") as file:
        color = None

        if image.dtype.name != "float32":
            raise Exception("Image dtype must be float32.")

        image = np.flipud(image)

        if len(image.shape) == 3 and image.shape[2] == 3:  # color image
            color = True
        elif (
            len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
        ):  # greyscale
            color = False
        else:
            raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

        # PFM header lines must be written as bytes
        file.write(("PF\n" if color else "Pf\n").encode())
        file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))

        endian = image.dtype.byteorder

        if endian == "<" or endian == "=" and sys.byteorder == "little":
            scale = -scale

        file.write("%f\n".encode() % scale)

        image.tofile(file)


def read_image(path):
    """Read image and output RGB image (0-1).
    Args:
        path (str): path to file
    Returns:
        array: RGB image (0-1)
    """
    img = cv2.imread(path)

    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0

    return img


def write_depth(path, depth, bits=1):
    """Write depth map to pfm and png file.
    Args:
        path (str): filepath without extension
        depth (array): depth
    """
    write_pfm(path + ".pfm", depth.astype(np.float32))

    depth_min = depth.min()
    depth_max = depth.max()

    max_val = (2**(8*bits))-1

    if depth_max - depth_min > np.finfo("float").eps:
        out = max_val * (depth - depth_min) / (depth_max - depth_min)
    else:
        # constant depth map; keep an array so the astype() calls below work
        out = np.zeros(depth.shape, dtype=depth.dtype)

    if bits == 1:
        cv2.imwrite(path + ".png", out.astype("uint8"))
    elif bits == 2:
        cv2.imwrite(path + ".png", out.astype("uint16"))

    return
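A small usage sketch for these helpers, assuming a float32 inverse-depth map produced elsewhere and write access to the working directory (the file prefix "example" is hypothetical):

import numpy as np

prediction = np.random.rand(480, 640).astype(np.float32)   # stand-in for a model output

# Writes example.pfm (raw float data) and example.png (16-bit visualization).
write_depth("example", prediction, bits=2)

img = read_image("example.png")   # RGB array scaled to [0, 1]
print(img.shape, img.min(), img.max())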
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/utils.py
ADDED
@@ -0,0 +1,199 @@
"""Utils for monoDepth.
"""
import sys
import re
import numpy as np
import cv2
import torch


def read_pfm(path):
    """Read pfm file.

    Args:
        path (str): path to file

    Returns:
        tuple: (data, scale)
    """
    with open(path, "rb") as file:

        color = None
        width = None
        height = None
        scale = None
        endian = None

        header = file.readline().rstrip()
        if header.decode("ascii") == "PF":
            color = True
        elif header.decode("ascii") == "Pf":
            color = False
        else:
            raise Exception("Not a PFM file: " + path)

        dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
        if dim_match:
            width, height = list(map(int, dim_match.groups()))
        else:
            raise Exception("Malformed PFM header.")

        scale = float(file.readline().decode("ascii").rstrip())
        if scale < 0:
            # little-endian
            endian = "<"
            scale = -scale
        else:
            # big-endian
            endian = ">"

        data = np.fromfile(file, endian + "f")
        shape = (height, width, 3) if color else (height, width)

        data = np.reshape(data, shape)
        data = np.flipud(data)

        return data, scale


def write_pfm(path, image, scale=1):
    """Write pfm file.

    Args:
        path (str): path to file
        image (array): data
        scale (int, optional): Scale. Defaults to 1.
    """

    with open(path, "wb") as file:
        color = None

        if image.dtype.name != "float32":
            raise Exception("Image dtype must be float32.")

        image = np.flipud(image)

        if len(image.shape) == 3 and image.shape[2] == 3:  # color image
            color = True
        elif (
            len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
        ):  # greyscale
            color = False
        else:
            raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

        # PFM header lines must be written as bytes
        file.write(("PF\n" if color else "Pf\n").encode())
        file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))

        endian = image.dtype.byteorder

        if endian == "<" or endian == "=" and sys.byteorder == "little":
            scale = -scale

        file.write("%f\n".encode() % scale)

        image.tofile(file)


def read_image(path):
    """Read image and output RGB image (0-1).

    Args:
        path (str): path to file

    Returns:
        array: RGB image (0-1)
    """
    img = cv2.imread(path)

    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0

    return img


def resize_image(img):
    """Resize image and make it fit for network.

    Args:
        img (array): image

    Returns:
        tensor: data ready for network
    """
    height_orig = img.shape[0]
    width_orig = img.shape[1]

    if width_orig > height_orig:
        scale = width_orig / 384
    else:
        scale = height_orig / 384

    height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
    width = (np.ceil(width_orig / scale / 32) * 32).astype(int)

    img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)

    img_resized = (
        torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
    )
    img_resized = img_resized.unsqueeze(0)

    return img_resized


def resize_depth(depth, width, height):
    """Resize depth map and bring to CPU (numpy).

    Args:
        depth (tensor): depth
        width (int): image width
        height (int): image height

    Returns:
        array: processed depth
    """
    depth = torch.squeeze(depth[0, :, :, :]).to("cpu")

    depth_resized = cv2.resize(
        depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
    )

    return depth_resized


def write_depth(path, depth, grayscale, bits=1):
    """Write depth map to png file.

    Args:
        path (str): filepath without extension
        depth (array): depth
        grayscale (bool): use a grayscale colormap?
    """
    if not grayscale:
        bits = 1

    if not np.isfinite(depth).all():
        depth = np.nan_to_num(depth, nan=0.0, posinf=0.0, neginf=0.0)
        print("WARNING: Non-finite depth values present")

    depth_min = depth.min()
    depth_max = depth.max()

    max_val = (2**(8*bits))-1

    if depth_max - depth_min > np.finfo("float").eps:
        out = max_val * (depth - depth_min) / (depth_max - depth_min)
    else:
        out = np.zeros(depth.shape, dtype=depth.dtype)

    if not grayscale:
        out = cv2.applyColorMap(np.uint8(out), cv2.COLORMAP_INFERNO)

    if bits == 1:
        cv2.imwrite(path + ".png", out.astype("uint8"))
    elif bits == 2:
        cv2.imwrite(path + ".png", out.astype("uint16"))

    return
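A quick round-trip sketch for the PFM helpers in this module, assuming write access to the working directory:

import numpy as np

depth = np.random.rand(4, 5).astype(np.float32)
write_pfm("sample.pfm", depth)

data, scale = read_pfm("sample.pfm")
print(data.shape, scale)           # (4, 5) 1.0
print(np.allclose(data, depth))    # True: flipud on write is undone on read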
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/base_models/midas_repo/weights/.placeholder
ADDED
File without changes
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/builder.py
ADDED
@@ -0,0 +1,51 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

from importlib import import_module
from .depth_model import DepthModel

def build_model(config) -> DepthModel:
    """Builds a model from a config. The model is specified by the model name and version in the config. The model is then constructed using the build_from_config function of the model interface.
    This function should be used to construct models for training and evaluation.

    Args:
        config (dict): Config dict. Config is constructed in utils/config.py. Each model has its own config file(s) saved in its root model folder.

    Returns:
        torch.nn.Module: Model corresponding to name and version as specified in config
    """
    module_name = f"zoedepth.models.{config.model}"
    try:
        module = import_module(module_name)
    except ModuleNotFoundError as e:
        # print the original error message
        print(e)
        raise ValueError(
            f"Model {config.model} not found. Refer above error for details.") from e
    try:
        get_version = getattr(module, "get_version")
    except AttributeError as e:
        raise ValueError(
            f"Model {config.model} has no get_version function.") from e
    return get_version(config.version_name).build_from_config(config)
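A minimal sketch of calling build_model, assuming the companion config helper referenced in the docstring (zoedepth/utils/config.py, not part of this diff) is available; it loads config_zoedepth.json further below and applies the mode-specific overrides:

from zoedepth.utils.config import get_config   # assumed helper from utils/config.py
from zoedepth.models.builder import build_model

config = get_config("zoedepth", "infer")        # model name + mode select the JSON config
model = build_model(config).eval()              # resolves get_version("v1").build_from_config(config)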
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/depth_model.py
ADDED
@@ -0,0 +1,152 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import PIL.Image
from PIL import Image
from typing import Union


class DepthModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.device = 'cpu'

    def to(self, device) -> nn.Module:
        self.device = device
        return super().to(device)

    def forward(self, x, *args, **kwargs):
        raise NotImplementedError

    def _infer(self, x: torch.Tensor):
        """
        Inference interface for the model
        Args:
            x (torch.Tensor): input tensor of shape (b, c, h, w)
        Returns:
            torch.Tensor: output tensor of shape (b, 1, h, w)
        """
        return self(x)['metric_depth']

    def _infer_with_pad_aug(self, x: torch.Tensor, pad_input: bool=True, fh: float=3, fw: float=3, upsampling_mode: str='bicubic', padding_mode="reflect", **kwargs) -> torch.Tensor:
        """
        Inference interface for the model with padding augmentation
        Padding augmentation fixes the boundary artifacts in the output depth map.
        Boundary artifacts are sometimes caused by the fact that the model is trained on NYU raw dataset which has a black or white border around the image.
        This augmentation pads the input image and crops the prediction back to the original size / view.

        Note: This augmentation is not required for the models trained with 'avoid_boundary'=True.
        Args:
            x (torch.Tensor): input tensor of shape (b, c, h, w)
            pad_input (bool, optional): whether to pad the input or not. Defaults to True.
            fh (float, optional): height padding factor. The padding is calculated as sqrt(h/2) * fh. Defaults to 3.
            fw (float, optional): width padding factor. The padding is calculated as sqrt(w/2) * fw. Defaults to 3.
            upsampling_mode (str, optional): upsampling mode. Defaults to 'bicubic'.
            padding_mode (str, optional): padding mode. Defaults to "reflect".
        Returns:
            torch.Tensor: output tensor of shape (b, 1, h, w)
        """
        # assert x is nchw and c = 3
        assert x.dim() == 4, "x must be 4 dimensional, got {}".format(x.dim())
        assert x.shape[1] == 3, "x must have 3 channels, got {}".format(x.shape[1])

        if pad_input:
            assert fh > 0 or fw > 0, "at least one of fh and fw must be greater than 0"
            pad_h = int(np.sqrt(x.shape[2]/2) * fh)
            pad_w = int(np.sqrt(x.shape[3]/2) * fw)
            padding = [pad_w, pad_w]
            if pad_h > 0:
                padding += [pad_h, pad_h]

            x = F.pad(x, padding, mode=padding_mode, **kwargs)
        out = self._infer(x)
        if out.shape[-2:] != x.shape[-2:]:
            out = F.interpolate(out, size=(x.shape[2], x.shape[3]), mode=upsampling_mode, align_corners=False)
        if pad_input:
            # crop to the original size, handling the case where pad_h and pad_w is 0
            if pad_h > 0:
                out = out[:, :, pad_h:-pad_h, :]
            if pad_w > 0:
                out = out[:, :, :, pad_w:-pad_w]
        return out

    def infer_with_flip_aug(self, x, pad_input: bool=True, **kwargs) -> torch.Tensor:
        """
        Inference interface for the model with horizontal flip augmentation
        Horizontal flip augmentation improves the accuracy of the model by averaging the output of the model with and without horizontal flip.
        Args:
            x (torch.Tensor): input tensor of shape (b, c, h, w)
            pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
        Returns:
            torch.Tensor: output tensor of shape (b, 1, h, w)
        """
        # infer with horizontal flip and average
        out = self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs)
        out_flip = self._infer_with_pad_aug(torch.flip(x, dims=[3]), pad_input=pad_input, **kwargs)
        out = (out + torch.flip(out_flip, dims=[3])) / 2
        return out

    def infer(self, x, pad_input: bool=True, with_flip_aug: bool=True, **kwargs) -> torch.Tensor:
        """
        Inference interface for the model
        Args:
            x (torch.Tensor): input tensor of shape (b, c, h, w)
            pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
            with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
        Returns:
            torch.Tensor: output tensor of shape (b, 1, h, w)
        """
        if with_flip_aug:
            return self.infer_with_flip_aug(x, pad_input=pad_input, **kwargs)
        else:
            return self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs)

    @torch.no_grad()
    def infer_pil(self, pil_img, pad_input: bool=True, with_flip_aug: bool=True, output_type: str="numpy", **kwargs) -> Union[np.ndarray, PIL.Image.Image, torch.Tensor]:
        """
        Inference interface for the model for PIL image
        Args:
            pil_img (PIL.Image.Image): input PIL image
            pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
            with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
            output_type (str, optional): output type. Supported values are 'numpy', 'pil' and 'tensor'. Defaults to "numpy".
        """
        x = transforms.ToTensor()(pil_img).unsqueeze(0).to(self.device)
        out_tensor = self.infer(x, pad_input=pad_input, with_flip_aug=with_flip_aug, **kwargs)
        if output_type == "numpy":
            return out_tensor.squeeze().cpu().numpy()
        elif output_type == "pil":
            # uint16 is required for depth pil image
            out_16bit_numpy = (out_tensor.squeeze().cpu().numpy()*256).astype(np.uint16)
            return Image.fromarray(out_16bit_numpy)
        elif output_type == "tensor":
            return out_tensor.squeeze().cpu()
        else:
            raise ValueError(f"output_type {output_type} not supported. Supported values are 'numpy', 'pil' and 'tensor'")
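The padding factors translate into pixel counts via sqrt(dim/2) * factor, so a 480x640 input with the default fh = fw = 3 gets roughly 46 px of vertical and 53 px of horizontal reflect padding before inference and is cropped back afterwards. A short usage sketch, assuming a concrete DepthModel subclass (such as the ZoeDepth model built further below) is already instantiated as model and that "room.jpg" exists:

from PIL import Image

img = Image.open("room.jpg").convert("RGB")

depth_np = model.infer_pil(img)                        # (H, W) numpy array, metric depth
depth_png = model.infer_pil(img, output_type="pil")    # 16-bit PIL image (depth * 256)
depth_png.save("room_depth.png")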
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/layers/attractor.py
ADDED
@@ -0,0 +1,208 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import torch
import torch.nn as nn


@torch.jit.script
def exp_attractor(dx, alpha: float = 300, gamma: int = 2):
    """Exponential attractor: dc = exp(-alpha*|dx|^gamma) * dx , where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center

    Args:
        dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
        alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300.
        gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2.

    Returns:
        torch.Tensor : Delta shifts - dc; New bin centers = Old bin centers + dc
    """
    return torch.exp(-alpha*(torch.abs(dx)**gamma)) * (dx)


@torch.jit.script
def inv_attractor(dx, alpha: float = 300, gamma: int = 2):
    """Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center
    This is the default one according to the accompanying paper.

    Args:
        dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
        alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300.
        gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2.

    Returns:
        torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc
    """
    return dx.div(1+alpha*dx.pow(gamma))


class AttractorLayer(nn.Module):
    def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10,
                 alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False):
        """
        Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth)
        """
        super().__init__()

        self.n_attractors = n_attractors
        self.n_bins = n_bins
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.alpha = alpha
        self.gamma = gamma
        self.kind = kind
        self.attractor_type = attractor_type
        self.memory_efficient = memory_efficient

        self._net = nn.Sequential(
            nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
            nn.ReLU(inplace=True),
            nn.Conv2d(mlp_dim, n_attractors*2, 1, 1, 0),  # x2 for linear norm
            nn.ReLU(inplace=True)
        )

    def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
        """
        Args:
            x (torch.Tensor) : feature block; shape - n, c, h, w
            b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w

        Returns:
            tuple(torch.Tensor,torch.Tensor) : new bin centers normed and scaled; shape - n, nbins, h, w
        """
        if prev_b_embedding is not None:
            if interpolate:
                prev_b_embedding = nn.functional.interpolate(
                    prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
            x = x + prev_b_embedding

        A = self._net(x)
        eps = 1e-3
        A = A + eps
        n, c, h, w = A.shape
        A = A.view(n, self.n_attractors, 2, h, w)
        A_normed = A / A.sum(dim=2, keepdim=True)  # n, a, 2, h, w
        A_normed = A[:, :, 0, ...]  # n, na, h, w

        b_prev = nn.functional.interpolate(
            b_prev, (h, w), mode='bilinear', align_corners=True)
        b_centers = b_prev

        if self.attractor_type == 'exp':
            dist = exp_attractor
        else:
            dist = inv_attractor

        if not self.memory_efficient:
            func = {'mean': torch.mean, 'sum': torch.sum}[self.kind]
            # .shape N, nbins, h, w
            delta_c = func(dist(A_normed.unsqueeze(
                2) - b_centers.unsqueeze(1)), dim=1)
        else:
            delta_c = torch.zeros_like(b_centers, device=b_centers.device)
            for i in range(self.n_attractors):
                # .shape N, nbins, h, w
                delta_c += dist(A_normed[:, i, ...].unsqueeze(1) - b_centers)

            if self.kind == 'mean':
                delta_c = delta_c / self.n_attractors

        b_new_centers = b_centers + delta_c
        B_centers = (self.max_depth - self.min_depth) * \
            b_new_centers + self.min_depth
        B_centers, _ = torch.sort(B_centers, dim=1)
        B_centers = torch.clip(B_centers, self.min_depth, self.max_depth)
        return b_new_centers, B_centers


class AttractorLayerUnnormed(nn.Module):
    def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10,
                 alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False):
        """
        Attractor layer for bin centers. Bin centers are unbounded
        """
        super().__init__()

        self.n_attractors = n_attractors
        self.n_bins = n_bins
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.alpha = alpha
        self.gamma = gamma
        self.kind = kind
        self.attractor_type = attractor_type
        self.memory_efficient = memory_efficient

        self._net = nn.Sequential(
            nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
            nn.ReLU(inplace=True),
            nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0),
            nn.Softplus()
        )

    def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
        """
        Args:
            x (torch.Tensor) : feature block; shape - n, c, h, w
            b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w

        Returns:
            tuple(torch.Tensor,torch.Tensor) : new bin centers unbounded; shape - n, nbins, h, w. Two outputs just to keep the API consistent with the normed version
        """
        if prev_b_embedding is not None:
            if interpolate:
                prev_b_embedding = nn.functional.interpolate(
                    prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
            x = x + prev_b_embedding

        A = self._net(x)
        n, c, h, w = A.shape

        b_prev = nn.functional.interpolate(
            b_prev, (h, w), mode='bilinear', align_corners=True)
        b_centers = b_prev

        if self.attractor_type == 'exp':
            dist = exp_attractor
        else:
            dist = inv_attractor

        if not self.memory_efficient:
            func = {'mean': torch.mean, 'sum': torch.sum}[self.kind]
            # .shape N, nbins, h, w
            delta_c = func(
                dist(A.unsqueeze(2) - b_centers.unsqueeze(1)), dim=1)
        else:
            delta_c = torch.zeros_like(b_centers, device=b_centers.device)
            for i in range(self.n_attractors):
                delta_c += dist(A[:, i, ...].unsqueeze(1) -
                                b_centers)  # .shape N, nbins, h, w

            if self.kind == 'mean':
                delta_c = delta_c / self.n_attractors

        b_new_centers = b_centers + delta_c
        B_centers = b_new_centers

        return b_new_centers, B_centers
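A tiny numeric sketch of what the attractor functions do to a set of bin centers, independent of the full layer (the values below are made up):

import torch

a = torch.tensor([2.0])               # one attractor point
c = torch.tensor([1.0, 2.5, 8.0])     # three bin centers
dx = a - c                            # dx = a - c, as in the docstrings above

dc = inv_attractor(dx)                # default alpha=300, gamma=2
print(c + dc)                         # every center is nudged towards 2.0; the nearest
                                      # center (2.5) moves most, the far one (8.0) barely moves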
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/layers/dist_layers.py
ADDED
@@ -0,0 +1,121 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import torch
import torch.nn as nn


def log_binom(n, k, eps=1e-7):
    """ log(nCk) using Stirling approximation """
    n = n + eps
    k = k + eps
    return n * torch.log(n) - k * torch.log(k) - (n-k) * torch.log(n-k+eps)


class LogBinomial(nn.Module):
    def __init__(self, n_classes=256, act=torch.softmax):
        """Compute log binomial distribution for n_classes

        Args:
            n_classes (int, optional): number of output classes. Defaults to 256.
        """
        super().__init__()
        self.K = n_classes
        self.act = act
        self.register_buffer('k_idx', torch.arange(
            0, n_classes).view(1, -1, 1, 1))
        self.register_buffer('K_minus_1', torch.Tensor(
            [self.K-1]).view(1, -1, 1, 1))

    def forward(self, x, t=1., eps=1e-4):
        """Compute log binomial distribution for x

        Args:
            x (torch.Tensor - NCHW): probabilities
            t (float, torch.Tensor - NCHW, optional): Temperature of distribution. Defaults to 1..
            eps (float, optional): Small number for numerical stability. Defaults to 1e-4.

        Returns:
            torch.Tensor -NCHW: log binomial distribution logbinomial(p;t)
        """
        if x.ndim == 3:
            x = x.unsqueeze(1)  # make it nchw

        one_minus_x = torch.clamp(1 - x, eps, 1)
        x = torch.clamp(x, eps, 1)
        y = log_binom(self.K_minus_1, self.k_idx) + self.k_idx * \
            torch.log(x) + (self.K - 1 - self.k_idx) * torch.log(one_minus_x)
        return self.act(y/t, dim=1)


class ConditionalLogBinomial(nn.Module):
    def __init__(self, in_features, condition_dim, n_classes=256, bottleneck_factor=2, p_eps=1e-4, max_temp=50, min_temp=1e-7, act=torch.softmax):
        """Conditional Log Binomial distribution

        Args:
            in_features (int): number of input channels in main feature
            condition_dim (int): number of input channels in condition feature
            n_classes (int, optional): Number of classes. Defaults to 256.
            bottleneck_factor (int, optional): Hidden dim factor. Defaults to 2.
            p_eps (float, optional): small eps value. Defaults to 1e-4.
            max_temp (float, optional): Maximum temperature of output distribution. Defaults to 50.
            min_temp (float, optional): Minimum temperature of output distribution. Defaults to 1e-7.
        """
        super().__init__()
        self.p_eps = p_eps
        self.max_temp = max_temp
        self.min_temp = min_temp
        self.log_binomial_transform = LogBinomial(n_classes, act=act)
        bottleneck = (in_features + condition_dim) // bottleneck_factor
        self.mlp = nn.Sequential(
            nn.Conv2d(in_features + condition_dim, bottleneck,
                      kernel_size=1, stride=1, padding=0),
            nn.GELU(),
            # 2 for p linear norm, 2 for t linear norm
            nn.Conv2d(bottleneck, 2+2, kernel_size=1, stride=1, padding=0),
            nn.Softplus()
        )

    def forward(self, x, cond):
        """Forward pass

        Args:
            x (torch.Tensor - NCHW): Main feature
            cond (torch.Tensor - NCHW): condition feature

        Returns:
            torch.Tensor: Output log binomial distribution
        """
        pt = self.mlp(torch.concat((x, cond), dim=1))
        p, t = pt[:, :2, ...], pt[:, 2:, ...]

        p = p + self.p_eps
        p = p[:, 0, ...] / (p[:, 0, ...] + p[:, 1, ...])

        t = t + self.p_eps
        t = t[:, 0, ...] / (t[:, 0, ...] + t[:, 1, ...])
        t = t.unsqueeze(1)
        t = (self.max_temp - self.min_temp) * t + self.min_temp

        return self.log_binomial_transform(p, t)
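A shape-level sketch of the conditional log-binomial head, with made-up channel sizes and random inputs:

import torch

head = ConditionalLogBinomial(in_features=32, condition_dim=128, n_classes=64)

feat = torch.randn(2, 32, 24, 32)     # main decoder feature
cond = torch.randn(2, 128, 24, 32)    # bin-embedding condition
probs = head(feat, cond)

print(probs.shape)                    # torch.Size([2, 64, 24, 32])
print(probs.sum(dim=1).mean())        # ~1.0: softmax over the 64 classes per pixel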
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/layers/localbins_layers.py
ADDED
@@ -0,0 +1,169 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import torch
import torch.nn as nn


class SeedBinRegressor(nn.Module):
    def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10):
        """Bin center regressor network. Bin centers are bounded on (min_depth, max_depth) interval.

        Args:
            in_features (int): input channels
            n_bins (int, optional): Number of bin centers. Defaults to 16.
            mlp_dim (int, optional): Hidden dimension. Defaults to 256.
            min_depth (float, optional): Min depth value. Defaults to 1e-3.
            max_depth (float, optional): Max depth value. Defaults to 10.
        """
        super().__init__()
        self.version = "1_1"
        self.min_depth = min_depth
        self.max_depth = max_depth

        self._net = nn.Sequential(
            nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
            nn.ReLU(inplace=True),
            nn.Conv2d(mlp_dim, n_bins, 1, 1, 0),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        """
        Returns tensor of bin_width vectors (centers). One vector b for every pixel
        """
        B = self._net(x)
        eps = 1e-3
        B = B + eps
        B_widths_normed = B / B.sum(dim=1, keepdim=True)
        B_widths = (self.max_depth - self.min_depth) * \
            B_widths_normed  # .shape NCHW
        # pad has the form (left, right, top, bottom, front, back)
        B_widths = nn.functional.pad(
            B_widths, (0, 0, 0, 0, 1, 0), mode='constant', value=self.min_depth)
        B_edges = torch.cumsum(B_widths, dim=1)  # .shape NCHW

        B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:, 1:, ...])
        return B_widths_normed, B_centers


class SeedBinRegressorUnnormed(nn.Module):
    def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10):
        """Bin center regressor network. Bin centers are unbounded

        Args:
            in_features (int): input channels
            n_bins (int, optional): Number of bin centers. Defaults to 16.
            mlp_dim (int, optional): Hidden dimension. Defaults to 256.
            min_depth (float, optional): Not used. (for compatibility with SeedBinRegressor)
            max_depth (float, optional): Not used. (for compatibility with SeedBinRegressor)
        """
        super().__init__()
        self.version = "1_1"
        self._net = nn.Sequential(
            nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
            nn.ReLU(inplace=True),
            nn.Conv2d(mlp_dim, n_bins, 1, 1, 0),
            nn.Softplus()
        )

    def forward(self, x):
        """
        Returns tensor of bin_width vectors (centers). One vector b for every pixel
        """
        B_centers = self._net(x)
        return B_centers, B_centers


class Projector(nn.Module):
    def __init__(self, in_features, out_features, mlp_dim=128):
        """Projector MLP

        Args:
            in_features (int): input channels
            out_features (int): output channels
            mlp_dim (int, optional): hidden dimension. Defaults to 128.
        """
        super().__init__()

        self._net = nn.Sequential(
            nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
            nn.ReLU(inplace=True),
            nn.Conv2d(mlp_dim, out_features, 1, 1, 0),
        )

    def forward(self, x):
        return self._net(x)


class LinearSplitter(nn.Module):
    def __init__(self, in_features, prev_nbins, split_factor=2, mlp_dim=128, min_depth=1e-3, max_depth=10):
        super().__init__()

        self.prev_nbins = prev_nbins
        self.split_factor = split_factor
        self.min_depth = min_depth
        self.max_depth = max_depth

        self._net = nn.Sequential(
            nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
            nn.GELU(),
            nn.Conv2d(mlp_dim, prev_nbins * split_factor, 1, 1, 0),
            nn.ReLU()
        )

    def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
        """
        x : feature block; shape - n, c, h, w
        b_prev : previous bin widths normed; shape - n, prev_nbins, h, w
        """
        if prev_b_embedding is not None:
            if interpolate:
                prev_b_embedding = nn.functional.interpolate(prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
            x = x + prev_b_embedding
        S = self._net(x)
        eps = 1e-3
        S = S + eps
        n, c, h, w = S.shape
        S = S.view(n, self.prev_nbins, self.split_factor, h, w)
        S_normed = S / S.sum(dim=2, keepdim=True)  # fractional splits

        b_prev = nn.functional.interpolate(b_prev, (h, w), mode='bilinear', align_corners=True)

        b_prev = b_prev / b_prev.sum(dim=1, keepdim=True)  # renormalize for guarantees
        # print(b_prev.shape, S_normed.shape)
        # if is_for_query:(1).expand(-1, b_prev.size(0)//n, -1, -1, -1, -1).flatten(0,1) # TODO ? can replace all this with a single torch.repeat?
        b = b_prev.unsqueeze(2) * S_normed
        b = b.flatten(1, 2)  # .shape n, prev_nbins * split_factor, h, w

        # calculate bin centers for loss calculation
        B_widths = (self.max_depth - self.min_depth) * b  # .shape N, nprev * splitfactor, H, W
        # pad has the form (left, right, top, bottom, front, back)
        B_widths = nn.functional.pad(B_widths, (0, 0, 0, 0, 1, 0), mode='constant', value=self.min_depth)
        B_edges = torch.cumsum(B_widths, dim=1)  # .shape NCHW

        B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:, 1:, ...])
        return b, B_centers
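A shape sketch of the seed bin regressor feeding one attractor refinement step, assuming the AttractorLayer from the attractor.py module above is importable alongside this file (channel sizes are illustrative only):

import torch

seed = SeedBinRegressor(in_features=256, n_bins=64, min_depth=1e-3, max_depth=10)
attractor = AttractorLayer(in_features=256, n_bins=64, n_attractors=16,
                           min_depth=1e-3, max_depth=10, attractor_type='exp')

feat = torch.randn(2, 256, 12, 16)
b_normed, b_centers = seed(feat)              # both (2, 64, 12, 16); centers lie in (0.001, 10)
b_new, B_centers = attractor(feat, b_normed)  # refined, still clipped to (min_depth, max_depth)
print(B_centers.shape, B_centers.min().item(), B_centers.max().item())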
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/layers/patch_transformer.py
ADDED
@@ -0,0 +1,91 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import torch
import torch.nn as nn


class PatchTransformerEncoder(nn.Module):
    def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4, use_class_token=False):
        """ViT-like transformer block

        Args:
            in_channels (int): Input channels
            patch_size (int, optional): patch size. Defaults to 10.
            embedding_dim (int, optional): Embedding dimension in transformer model. Defaults to 128.
            num_heads (int, optional): number of attention heads. Defaults to 4.
            use_class_token (bool, optional): Whether to use extra token at the start for global accumulation (called as "class token"). Defaults to False.
        """
        super(PatchTransformerEncoder, self).__init__()
        self.use_class_token = use_class_token
        encoder_layers = nn.TransformerEncoderLayer(
            embedding_dim, num_heads, dim_feedforward=1024)
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers, num_layers=4)  # takes shape S,N,E

        self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim,
                                           kernel_size=patch_size, stride=patch_size, padding=0)

    def positional_encoding_1d(self, sequence_length, batch_size, embedding_dim, device='cpu'):
        """Generate positional encodings

        Args:
            sequence_length (int): Sequence length
            embedding_dim (int): Embedding dimension

        Returns:
            torch.Tensor SBE: Positional encodings
        """
        position = torch.arange(
            0, sequence_length, dtype=torch.float32, device=device).unsqueeze(1)
        index = torch.arange(
            0, embedding_dim, 2, dtype=torch.float32, device=device).unsqueeze(0)
        div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim))
        pos_encoding = position * div_term
        pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1)
        pos_encoding = pos_encoding.unsqueeze(1).repeat(1, batch_size, 1)
        return pos_encoding

    def forward(self, x):
        """Forward pass

        Args:
            x (torch.Tensor - NCHW): Input feature tensor

        Returns:
            torch.Tensor - SNE: Transformer output embeddings. S - sequence length (=HW/patch_size^2), N - batch size, E - embedding dim
        """
        embeddings = self.embedding_convPxP(x).flatten(
            2)  # .shape = n,c,s = n, embedding_dim, s
        if self.use_class_token:
            # extra special token at start ?
            embeddings = nn.functional.pad(embeddings, (1, 0))

        # change to S,N,E format required by transformer
        embeddings = embeddings.permute(2, 0, 1)
        S, N, E = embeddings.shape
        embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device)
        x = self.transformer_encoder(embeddings)  # .shape = S, N, E
        return x
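A shape sketch with illustrative sizes: a 120x160 feature map with patch_size=10 yields 12*16 = 192 patch tokens.

import torch

enc = PatchTransformerEncoder(in_channels=256, patch_size=10, embedding_dim=128, num_heads=4)
feat = torch.randn(2, 256, 120, 160)   # N, C, H, W
tokens = enc(feat)
print(tokens.shape)                    # torch.Size([192, 2, 128])  -> S, N, E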
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/model_io.py
ADDED
@@ -0,0 +1,92 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import torch

def load_state_dict(model, state_dict):
    """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict.

    DataParallel prefixes state_dict keys with 'module.' when saving.
    If the model is not a DataParallel model but the state_dict is, then prefixes are removed.
    If the model is a DataParallel model but the state_dict is not, then prefixes are added.
    """
    state_dict = state_dict.get('model', state_dict)
    # if model is a DataParallel model, then state_dict keys are prefixed with 'module.'

    do_prefix = isinstance(
        model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel))
    state = {}
    for k, v in state_dict.items():
        if k.startswith('module.') and not do_prefix:
            k = k[7:]

        if not k.startswith('module.') and do_prefix:
            k = 'module.' + k

        state[k] = v

    model.load_state_dict(state)
    print("Loaded successfully")
    return model


def load_wts(model, checkpoint_path):
    ckpt = torch.load(checkpoint_path, map_location='cpu')
    return load_state_dict(model, ckpt)


def load_state_dict_from_url(model, url, **kwargs):
    state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu', **kwargs)
    return load_state_dict(model, state_dict)


def load_state_from_resource(model, resource: str):
    """Loads weights to the model from a given resource. A resource can be of following types:
        1. URL. Prefixed with "url::"
            e.g. url::http(s)://url.resource.com/ckpt.pt

        2. Local path. Prefixed with "local::"
            e.g. local::/path/to/ckpt.pt

    Args:
        model (torch.nn.Module): Model
        resource (str): resource string

    Returns:
        torch.nn.Module: Model with loaded weights
    """
    print(f"Using pretrained resource {resource}")

    if resource.startswith('url::'):
        url = resource.split('url::')[1]
        return load_state_dict_from_url(model, url, progress=True)

    elif resource.startswith('local::'):
        path = resource.split('local::')[1]
        return load_wts(model, path)

    else:
        raise ValueError("Invalid resource type, only url:: and local:: are supported")
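The resource-string convention here is exactly what the pretrained_resource fields in the JSON configs below use. A minimal sketch, assuming model is an already constructed ZoeDepth instance and the local path is hypothetical:

# Remote checkpoint, as referenced by config_zoedepth_kitti.json:
model = load_state_from_resource(
    model, "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt")

# Or an already downloaded file:
model = load_state_from_resource(model, "local::/path/to/ZoeD_M12_K.pt")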
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth/__init__.py
ADDED
@@ -0,0 +1,31 @@
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

from .zoedepth_v1 import ZoeDepth

all_versions = {
    "v1": ZoeDepth,
}

get_version = lambda v : all_versions[v]
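This registry is what builder.build_model looks up. A one-line sketch of resolving a version by hand (the "v1" key comes from version_name in the config below):

ZoeDepthClass = get_version("v1")   # the ZoeDepth class from zoedepth_v1.py
# build_from_config(config) would then instantiate it from a config object.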
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth.json
ADDED
@@ -0,0 +1,58 @@
{
    "model": {
        "name": "ZoeDepth",
        "version_name": "v1",
        "n_bins": 64,
        "bin_embedding_dim": 128,
        "bin_centers_type": "softplus",
        "n_attractors":[16, 8, 4, 1],
        "attractor_alpha": 1000,
        "attractor_gamma": 2,
        "attractor_kind" : "mean",
        "attractor_type" : "inv",
        "midas_model_type" : "DPT_BEiT_L_384",
        "min_temp": 0.0212,
        "max_temp": 50.0,
        "output_distribution": "logbinomial",
        "memory_efficient": true,
        "inverse_midas": false,
        "img_size": [384, 512]
    },

    "train": {
        "train_midas": true,
        "use_pretrained_midas": true,
        "trainer": "zoedepth",
        "epochs": 5,
        "bs": 16,
        "optim_kwargs": {"lr": 0.000161, "wd": 0.01},
        "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true},
        "same_lr": false,
        "w_si": 1,
        "w_domain": 0.2,
        "w_reg": 0,
        "w_grad": 0,
        "avoid_boundary": false,
        "random_crop": false,
        "input_width": 640,
        "input_height": 480,
        "midas_lr_factor": 1,
        "encoder_lr_factor":10,
        "pos_enc_lr_factor":10,
        "freeze_midas_bn": true

    },

    "infer":{
        "train_midas": false,
        "use_pretrained_midas": false,
        "pretrained_resource" : null,
        "force_keep_ar": true
    },

    "eval":{
        "train_midas": false,
        "use_pretrained_midas": false,
        "pretrained_resource" : null
    }
}
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json
ADDED
@@ -0,0 +1,22 @@
+{
+    "model": {
+        "bin_centers_type": "normed",
+        "img_size": [384, 768]
+    },
+
+    "train": {
+    },
+
+    "infer":{
+        "train_midas": false,
+        "use_pretrained_midas": false,
+        "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt",
+        "force_keep_ar": true
+    },
+
+    "eval":{
+        "train_midas": false,
+        "use_pretrained_midas": false,
+        "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt"
+    }
+}
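
The KITTI file only carries overrides on top of config_zoedepth.json: "normed" bin centers, a wider 384x768 input, and a released checkpoint URL for inference/eval. A sketch of the implied override merge, using an assumed deep_update helper (the repo's actual loader may merge differently):

import json

def deep_update(base, override):
    # Recursively apply override values on top of the base config.
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

with open("config_zoedepth.json") as f:
    cfg = json.load(f)
with open("config_zoedepth_kitti.json") as f:
    cfg = deep_update(cfg, json.load(f))

print(cfg["model"]["bin_centers_type"])   # "normed" (overridden by the KITTI file)
print(cfg["model"]["n_bins"])             # 64 (inherited from the base config)
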
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth/zoedepth_v1.py
ADDED
@@ -0,0 +1,250 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import itertools
+
+import torch
+import torch.nn as nn
+from ..depth_model import DepthModel
+from ..base_models.midas import MidasCore
+from ..layers.attractor import AttractorLayer, AttractorLayerUnnormed
+from ..layers.dist_layers import ConditionalLogBinomial
+from ..layers.localbins_layers import (Projector, SeedBinRegressor,
+                                       SeedBinRegressorUnnormed)
+from ..model_io import load_state_from_resource
+
+
+class ZoeDepth(DepthModel):
+    def __init__(self, core, n_bins=64, bin_centers_type="softplus", bin_embedding_dim=128, min_depth=1e-3, max_depth=10,
+                 n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True,
+                 midas_lr_factor=10, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs):
+        """ZoeDepth model. This is the version of ZoeDepth that has a single metric head
+
+        Args:
+            core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features
+            n_bins (int, optional): Number of bin centers. Defaults to 64.
+            bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers.
+                                              For "softplus", softplus activation is used and thus are unbounded. Defaults to "softplus".
+            bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128.
+            min_depth (float, optional): Lower bound for normed bin centers. Defaults to 1e-3.
+            max_depth (float, optional): Upper bound for normed bin centers. Defaults to 10.
+            n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1].
+            attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300.
+            attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2.
+            attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'.
+            attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'.
+            min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5.
+            max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50.
+            train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True.
+            midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10.
+            encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10.
+            pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10.
+        """
+        super().__init__()
+
+        self.core = core
+        self.max_depth = max_depth
+        self.min_depth = min_depth
+        self.min_temp = min_temp
+        self.bin_centers_type = bin_centers_type
+
+        self.midas_lr_factor = midas_lr_factor
+        self.encoder_lr_factor = encoder_lr_factor
+        self.pos_enc_lr_factor = pos_enc_lr_factor
+        self.train_midas = train_midas
+        self.inverse_midas = inverse_midas
+
+        if self.encoder_lr_factor <= 0:
+            self.core.freeze_encoder(
+                freeze_rel_pos=self.pos_enc_lr_factor <= 0)
+
+        N_MIDAS_OUT = 32
+        btlnck_features = self.core.output_channels[0]
+        num_out_features = self.core.output_channels[1:]
+
+        self.conv2 = nn.Conv2d(btlnck_features, btlnck_features,
+                               kernel_size=1, stride=1, padding=0)  # btlnck conv
+
+        if bin_centers_type == "normed":
+            SeedBinRegressorLayer = SeedBinRegressor
+            Attractor = AttractorLayer
+        elif bin_centers_type == "softplus":
+            SeedBinRegressorLayer = SeedBinRegressorUnnormed
+            Attractor = AttractorLayerUnnormed
+        elif bin_centers_type == "hybrid1":
+            SeedBinRegressorLayer = SeedBinRegressor
+            Attractor = AttractorLayerUnnormed
+        elif bin_centers_type == "hybrid2":
+            SeedBinRegressorLayer = SeedBinRegressorUnnormed
+            Attractor = AttractorLayer
+        else:
+            raise ValueError(
+                "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'")
+
+        self.seed_bin_regressor = SeedBinRegressorLayer(
+            btlnck_features, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth)
+        self.seed_projector = Projector(btlnck_features, bin_embedding_dim)
+        self.projectors = nn.ModuleList([
+            Projector(num_out, bin_embedding_dim)
+            for num_out in num_out_features
+        ])
+        self.attractors = nn.ModuleList([
+            Attractor(bin_embedding_dim, n_bins, n_attractors=n_attractors[i], min_depth=min_depth, max_depth=max_depth,
+                      alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type)
+            for i in range(len(num_out_features))
+        ])
+
+        last_in = N_MIDAS_OUT + 1  # +1 for relative depth
+
+        # use log binomial instead of softmax
+        self.conditional_log_binomial = ConditionalLogBinomial(
+            last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp)
+
+    def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs):
+        """
+        Args:
+            x (torch.Tensor): Input image tensor of shape (B, C, H, W)
+            return_final_centers (bool, optional): Whether to return the final bin centers. Defaults to False.
+            denorm (bool, optional): Whether to denormalize the input image. This reverses ImageNet normalization as midas normalization is different. Defaults to False.
+            return_probs (bool, optional): Whether to return the output probability distribution. Defaults to False.
+
+        Returns:
+            dict: Dictionary containing the following keys:
+                - rel_depth (torch.Tensor): Relative depth map of shape (B, H, W)
+                - metric_depth (torch.Tensor): Metric depth map of shape (B, 1, H, W)
+                - bin_centers (torch.Tensor): Bin centers of shape (B, n_bins). Present only if return_final_centers is True
+                - probs (torch.Tensor): Output probability distribution of shape (B, n_bins, H, W). Present only if return_probs is True
+
+        """
+        b, c, h, w = x.shape
+        # print("input shape ", x.shape)
+        self.orig_input_width = w
+        self.orig_input_height = h
+        rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True)
+        # print("output shapes", rel_depth.shape, out.shape)
+
+        outconv_activation = out[0]
+        btlnck = out[1]
+        x_blocks = out[2:]
+
+        x_d0 = self.conv2(btlnck)
+        x = x_d0
+        _, seed_b_centers = self.seed_bin_regressor(x)
+
+        if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2':
+            b_prev = (seed_b_centers - self.min_depth) / \
+                (self.max_depth - self.min_depth)
+        else:
+            b_prev = seed_b_centers
+
+        prev_b_embedding = self.seed_projector(x)
+
+        # unroll this loop for better performance
+        for projector, attractor, x in zip(self.projectors, self.attractors, x_blocks):
+            b_embedding = projector(x)
+            b, b_centers = attractor(
+                b_embedding, b_prev, prev_b_embedding, interpolate=True)
+            b_prev = b.clone()
+            prev_b_embedding = b_embedding.clone()
+
+        last = outconv_activation
+
+        if self.inverse_midas:
+            # invert depth followed by normalization
+            rel_depth = 1.0 / (rel_depth + 1e-6)
+            rel_depth = (rel_depth - rel_depth.min()) / \
+                (rel_depth.max() - rel_depth.min())
+        # concat rel depth with last. First interpolate rel depth to last size
+        rel_cond = rel_depth.unsqueeze(1)
+        rel_cond = nn.functional.interpolate(
+            rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True)
+        last = torch.cat([last, rel_cond], dim=1)
+
+        b_embedding = nn.functional.interpolate(
+            b_embedding, last.shape[-2:], mode='bilinear', align_corners=True)
+        x = self.conditional_log_binomial(last, b_embedding)
+
+        # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
+        # print(x.shape, b_centers.shape)
+        b_centers = nn.functional.interpolate(
+            b_centers, x.shape[-2:], mode='bilinear', align_corners=True)
+        out = torch.sum(x * b_centers, dim=1, keepdim=True)
+
+        # Structure output dict
+        output = dict(metric_depth=out)
+        if return_final_centers or return_probs:
+            output['bin_centers'] = b_centers
+
+        if return_probs:
+            output['probs'] = x
+
+        return output
+
+    def get_lr_params(self, lr):
+        """
+        Learning rate configuration for different layers of the model
+        Args:
+            lr (float) : Base learning rate
+        Returns:
+            list : list of parameters to optimize and their learning rates, in the format required by torch optimizers.
+        """
+        param_conf = []
+        if self.train_midas:
+            if self.encoder_lr_factor > 0:
+                param_conf.append({'params': self.core.get_enc_params_except_rel_pos(
+                ), 'lr': lr / self.encoder_lr_factor})
+
+            if self.pos_enc_lr_factor > 0:
+                param_conf.append(
+                    {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor})
+
+            midas_params = self.core.core.scratch.parameters()
+            midas_lr_factor = self.midas_lr_factor
+            param_conf.append(
+                {'params': midas_params, 'lr': lr / midas_lr_factor})
+
+        remaining_modules = []
+        for name, child in self.named_children():
+            if name != 'core':
+                remaining_modules.append(child)
+        remaining_params = itertools.chain(
+            *[child.parameters() for child in remaining_modules])
+
+        param_conf.append({'params': remaining_params, 'lr': lr})
+
+        return param_conf
+
+    @staticmethod
+    def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs):
+        core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas,
+                               train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs)
+        model = ZoeDepth(core, **kwargs)
+        if pretrained_resource:
+            assert isinstance(pretrained_resource, str), "pretrained_resource must be a string"
+            model = load_state_from_resource(model, pretrained_resource)
+        return model
+
+    @staticmethod
+    def build_from_config(config):
+        return ZoeDepth.build(**config)
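
To tie the pieces together, a hedged end-to-end sketch of the class above: build via build_from_config, run a forward pass, and set up per-module learning rates with get_lr_params. It assumes torch is installed and the zoedepth package from this commit is importable; MidasCore.build fetches the MiDaS backbone, and the kwargs mirror config_zoedepth.json rather than any required signature.

import torch
from zoedepth.models.zoedepth.zoedepth_v1 import ZoeDepth

# Build the model roughly as build_from_config would from the JSON config.
config = {"midas_model_type": "DPT_BEiT_L_384", "use_pretrained_midas": True,
          "train_midas": False, "n_bins": 64, "bin_centers_type": "softplus"}
model = ZoeDepth.build_from_config(config).eval()

with torch.no_grad():
    x = torch.rand(1, 3, 384, 512)             # (B, C, H, W); normalized RGB in practice
    out = model(x, return_final_centers=True)
print(out["metric_depth"].shape)               # single-channel metric depth map (see docstring)

# For training, get_lr_params divides the base lr by the encoder/pos-enc/midas factors:
optimizer = torch.optim.AdamW(model.get_lr_params(lr=1.61e-4), weight_decay=0.01)
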