toto10 committed
Commit ee4ceae
1 Parent(s): 47be066

27e42d28c71811068f519a9ffbc46cb633703ba3422ea495312037611c462237

Files changed (33)
  1. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/__init__.py +31 -0
  2. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json +67 -0
  3. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py +333 -0
  4. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/__init__.py +24 -0
  5. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/arg_utils.py +33 -0
  6. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/config.py +437 -0
  7. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/easydict/__init__.py +158 -0
  8. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/geometry.py +98 -0
  9. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/misc.py +368 -0
  10. extensions/microsoftexcel-controlnet/example/api_img2img.ipynb +105 -0
  11. extensions/microsoftexcel-controlnet/example/api_txt2img.ipynb +104 -0
  12. extensions/microsoftexcel-controlnet/example/chatgpt.py +676 -0
  13. extensions/microsoftexcel-controlnet/example/visual_chatgpt.ipynb +60 -0
  14. extensions/microsoftexcel-controlnet/extract_controlnet.py +27 -0
  15. extensions/microsoftexcel-controlnet/extract_controlnet_diff.py +91 -0
  16. extensions/microsoftexcel-controlnet/install.py +20 -0
  17. extensions/microsoftexcel-controlnet/javascript/hints.js +17 -0
  18. extensions/microsoftexcel-controlnet/models/cldm_v15.yaml +79 -0
  19. extensions/microsoftexcel-controlnet/models/cldm_v21.yaml +85 -0
  20. extensions/microsoftexcel-controlnet/models/control_sd15_canny.yaml +79 -0
  21. extensions/microsoftexcel-controlnet/models/control_sd15_depth.yaml +79 -0
  22. extensions/microsoftexcel-controlnet/models/control_sd15_hed.yaml +79 -0
  23. extensions/microsoftexcel-controlnet/models/control_sd15_mlsd.yaml +79 -0
  24. extensions/microsoftexcel-controlnet/models/control_sd15_normal.yaml +79 -0
  25. extensions/microsoftexcel-controlnet/models/control_sd15_openpose.yaml +79 -0
  26. extensions/microsoftexcel-controlnet/models/control_sd15_scribble.yaml +79 -0
  27. extensions/microsoftexcel-controlnet/models/control_sd15_seg.yaml +79 -0
  28. extensions/microsoftexcel-controlnet/models/control_v11e_sd15_ip2p.yaml +79 -0
  29. extensions/microsoftexcel-controlnet/models/control_v11e_sd15_shuffle.yaml +80 -0
  30. extensions/microsoftexcel-controlnet/models/control_v11f1e_sd15_tile.safetensors +3 -0
  31. extensions/microsoftexcel-controlnet/models/control_v11f1e_sd15_tile.yaml +79 -0
  32. extensions/microsoftexcel-controlnet/models/control_v11f1p_sd15_depth.safetensors +3 -0
  33. extensions/microsoftexcel-controlnet/models/control_v11f1p_sd15_depth.yaml +79 -0
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/__init__.py ADDED
@@ -0,0 +1,31 @@
+ # MIT License
+
+ # Copyright (c) 2022 Intelligent Systems Lab Org
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # File author: Shariq Farooq Bhat
+
+ from .zoedepth_nk_v1 import ZoeDepthNK
+
+ all_versions = {
+     "v1": ZoeDepthNK,
+ }
+
+ get_version = lambda v: all_versions[v]
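This module is a tiny version registry: all_versions maps a version string to the model class and get_version looks it up. A minimal usage sketch, assuming the package is importable as zoedepth.models.zoedepth_nk (the upstream ZoeDepth layout):

from zoedepth.models.zoedepth_nk import get_version

ZoeDepthNK_cls = get_version("v1")        # resolves the "v1" entry to the ZoeDepthNK class
print(ZoeDepthNK_cls.__name__)            # "ZoeDepthNK"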
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json ADDED
@@ -0,0 +1,67 @@
+ {
+     "model": {
+         "name": "ZoeDepthNK",
+         "version_name": "v1",
+         "bin_conf" : [
+             {
+                 "name": "nyu",
+                 "n_bins": 64,
+                 "min_depth": 1e-3,
+                 "max_depth": 10.0
+             },
+             {
+                 "name": "kitti",
+                 "n_bins": 64,
+                 "min_depth": 1e-3,
+                 "max_depth": 80.0
+             }
+         ],
+         "bin_embedding_dim": 128,
+         "bin_centers_type": "softplus",
+         "n_attractors": [16, 8, 4, 1],
+         "attractor_alpha": 1000,
+         "attractor_gamma": 2,
+         "attractor_kind" : "mean",
+         "attractor_type" : "inv",
+         "min_temp": 0.0212,
+         "max_temp": 50.0,
+         "memory_efficient": true,
+         "midas_model_type" : "DPT_BEiT_L_384",
+         "img_size": [384, 512]
+     },
+
+     "train": {
+         "train_midas": true,
+         "use_pretrained_midas": true,
+         "trainer": "zoedepth_nk",
+         "epochs": 5,
+         "bs": 16,
+         "optim_kwargs": {"lr": 0.0002512, "wd": 0.01},
+         "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase": false, "cycle_momentum": true},
+         "same_lr": false,
+         "w_si": 1,
+         "w_domain": 100,
+         "avoid_boundary": false,
+         "random_crop": false,
+         "input_width": 640,
+         "input_height": 480,
+         "w_grad": 0,
+         "w_reg": 0,
+         "midas_lr_factor": 10,
+         "encoder_lr_factor": 10,
+         "pos_enc_lr_factor": 10
+     },
+
+     "infer": {
+         "train_midas": false,
+         "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt",
+         "use_pretrained_midas": false,
+         "force_keep_ar": true
+     },
+
+     "eval": {
+         "train_midas": false,
+         "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt",
+         "use_pretrained_midas": false
+     }
+ }
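The "model" block supplies the constructor arguments for ZoeDepthNK (two bin configurations, one per metric head), while "train"/"infer"/"eval" override per mode; get_config in zoedepth/utils/config.py flattens and merges these. A minimal sketch of reading the file directly, assuming it sits in the working directory:

import json

with open("config_zoedepth_nk.json") as f:
    cfg = json.load(f)

# mode-specific keys override the shared "model" block, mirroring what get_config does
infer_kwargs = {**cfg["model"], **cfg["infer"]}
print(infer_kwargs["bin_conf"][0]["name"], infer_kwargs["pretrained_resource"])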
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py ADDED
@@ -0,0 +1,333 @@
+ # MIT License
+
+ # Copyright (c) 2022 Intelligent Systems Lab Org
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # File author: Shariq Farooq Bhat
+
+ import itertools
+
+ import torch
+ import torch.nn as nn
+
+ from zoedepth.models.depth_model import DepthModel
+ from zoedepth.models.base_models.midas import MidasCore
+ from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed
+ from zoedepth.models.layers.dist_layers import ConditionalLogBinomial
+ from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor,
+                                                      SeedBinRegressorUnnormed)
+ from zoedepth.models.layers.patch_transformer import PatchTransformerEncoder
+ from zoedepth.models.model_io import load_state_from_resource
+
+
+ class ZoeDepthNK(DepthModel):
+     def __init__(self, core, bin_conf, bin_centers_type="softplus", bin_embedding_dim=128,
+                  n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp',
+                  min_temp=5, max_temp=50,
+                  memory_efficient=False, train_midas=True,
+                  is_midas_pretrained=True, midas_lr_factor=1, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs):
+         """ZoeDepthNK model. This is the version of ZoeDepth that has two metric heads and uses a learned router to route to experts.
+
+         Args:
+             core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features
+
+             bin_conf (List[dict]): A list of dictionaries that contain the bin configuration for each metric head. Each dictionary should contain the following keys:
+                 "name" (str, typically same as the dataset name), "n_bins" (int), "min_depth" (float), "max_depth" (float)
+
+                 The length of this list determines the number of metric heads.
+             bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, the linear normalization trick is applied, which results in bounded bin centers.
+                 For "softplus", softplus activation is used and the centers are thus unbounded. Defaults to "softplus".
+             bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128.
+
+             n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1].
+             attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300.
+             attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2.
+             attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'.
+             attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'.
+
+             min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5.
+             max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50.
+
+             memory_efficient (bool, optional): Whether to use the memory efficient version of the attractor layers. The memory efficient version is slower but is recommended in case of multiple metric heads in order to save GPU memory. Defaults to False.
+
+             train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True.
+             is_midas_pretrained (bool, optional): Is "core" pretrained? Defaults to True.
+             midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10.
+             encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10.
+             pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10.
+
+         """
+
+         super().__init__()
+
+         self.core = core
+         self.bin_conf = bin_conf
+         self.min_temp = min_temp
+         self.max_temp = max_temp
+         self.memory_efficient = memory_efficient
+         self.train_midas = train_midas
+         self.is_midas_pretrained = is_midas_pretrained
+         self.midas_lr_factor = midas_lr_factor
+         self.encoder_lr_factor = encoder_lr_factor
+         self.pos_enc_lr_factor = pos_enc_lr_factor
+         self.inverse_midas = inverse_midas
+
+         N_MIDAS_OUT = 32
+         btlnck_features = self.core.output_channels[0]
+         num_out_features = self.core.output_channels[1:]
+         # self.scales = [16, 8, 4, 2]  # spatial scale factors
+
+         self.conv2 = nn.Conv2d(
+             btlnck_features, btlnck_features, kernel_size=1, stride=1, padding=0)
+
+         # Transformer classifier on the bottleneck
+         self.patch_transformer = PatchTransformerEncoder(
+             btlnck_features, 1, 128, use_class_token=True)
+         self.mlp_classifier = nn.Sequential(
+             nn.Linear(128, 128),
+             nn.ReLU(),
+             nn.Linear(128, 2)
+         )
+
+         if bin_centers_type == "normed":
+             SeedBinRegressorLayer = SeedBinRegressor
+             Attractor = AttractorLayer
+         elif bin_centers_type == "softplus":
+             SeedBinRegressorLayer = SeedBinRegressorUnnormed
+             Attractor = AttractorLayerUnnormed
+         elif bin_centers_type == "hybrid1":
+             SeedBinRegressorLayer = SeedBinRegressor
+             Attractor = AttractorLayerUnnormed
+         elif bin_centers_type == "hybrid2":
+             SeedBinRegressorLayer = SeedBinRegressorUnnormed
+             Attractor = AttractorLayer
+         else:
+             raise ValueError(
+                 "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'")
+         self.bin_centers_type = bin_centers_type
+         # We have bins for each bin conf.
+         # Create a map (ModuleDict) of 'name' -> seed_bin_regressor
+         self.seed_bin_regressors = nn.ModuleDict(
+             {conf['name']: SeedBinRegressorLayer(btlnck_features, conf["n_bins"], mlp_dim=bin_embedding_dim//2, min_depth=conf["min_depth"], max_depth=conf["max_depth"])
+              for conf in bin_conf}
+         )
+
+         self.seed_projector = Projector(
+             btlnck_features, bin_embedding_dim, mlp_dim=bin_embedding_dim//2)
+         self.projectors = nn.ModuleList([
+             Projector(num_out, bin_embedding_dim, mlp_dim=bin_embedding_dim//2)
+             for num_out in num_out_features
+         ])
+
+         # Create a map (ModuleDict) of 'name' -> attractors (ModuleList)
+         self.attractors = nn.ModuleDict(
+             {conf['name']: nn.ModuleList([
+                 Attractor(bin_embedding_dim, n_attractors[i],
+                           mlp_dim=bin_embedding_dim, alpha=attractor_alpha,
+                           gamma=attractor_gamma, kind=attractor_kind,
+                           attractor_type=attractor_type, memory_efficient=memory_efficient,
+                           min_depth=conf["min_depth"], max_depth=conf["max_depth"])
+                 for i in range(len(n_attractors))
+             ])
+                 for conf in bin_conf}
+         )
+
+         last_in = N_MIDAS_OUT
+         # conditional log binomial for each bin conf
+         self.conditional_log_binomial = nn.ModuleDict(
+             {conf['name']: ConditionalLogBinomial(last_in, bin_embedding_dim, conf['n_bins'], bottleneck_factor=4, min_temp=self.min_temp, max_temp=self.max_temp)
+              for conf in bin_conf}
+         )
+
+     def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs):
+         """
+         Args:
+             x (torch.Tensor): Input image tensor of shape (B, C, H, W). Assumes all images are from the same domain.
+             return_final_centers (bool, optional): Whether to return the final centers of the attractors. Defaults to False.
+             denorm (bool, optional): Whether to denormalize the input image. Defaults to False.
+             return_probs (bool, optional): Whether to return the probabilities of the bins. Defaults to False.
+
+         Returns:
+             dict: Dictionary of outputs with keys:
+                 - "rel_depth": Relative depth map of shape (B, 1, H, W)
+                 - "metric_depth": Metric depth map of shape (B, 1, H, W)
+                 - "domain_logits": Domain logits of shape (B, 2)
+                 - "bin_centers": Bin centers of shape (B, N, H, W). Present only if return_final_centers is True
+                 - "probs": Bin probabilities of shape (B, N, H, W). Present only if return_probs is True
+         """
+         b, c, h, w = x.shape
+         self.orig_input_width = w
+         self.orig_input_height = h
+         rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True)
+
+         outconv_activation = out[0]
+         btlnck = out[1]
+         x_blocks = out[2:]
+
+         x_d0 = self.conv2(btlnck)
+         x = x_d0
+
+         # Predict which path to take
+         embedding = self.patch_transformer(x)[0]  # N, E
+         domain_logits = self.mlp_classifier(embedding)  # N, 2
+         domain_vote = torch.softmax(domain_logits.sum(
+             dim=0, keepdim=True), dim=-1)  # 1, 2
+
+         # Get the path
+         bin_conf_name = ["nyu", "kitti"][torch.argmax(
+             domain_vote, dim=-1).squeeze().item()]
+
+         try:
+             conf = [c for c in self.bin_conf if c.name == bin_conf_name][0]
+         except IndexError:
+             raise ValueError(
+                 f"bin_conf_name {bin_conf_name} not found in bin_confs")
+
+         min_depth = conf['min_depth']
+         max_depth = conf['max_depth']
+
+         seed_bin_regressor = self.seed_bin_regressors[bin_conf_name]
+         _, seed_b_centers = seed_bin_regressor(x)
+         if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2':
+             b_prev = (seed_b_centers - min_depth)/(max_depth - min_depth)
+         else:
+             b_prev = seed_b_centers
+         prev_b_embedding = self.seed_projector(x)
+
+         attractors = self.attractors[bin_conf_name]
+         for projector, attractor, x in zip(self.projectors, attractors, x_blocks):
+             b_embedding = projector(x)
+             b, b_centers = attractor(
+                 b_embedding, b_prev, prev_b_embedding, interpolate=True)
+             b_prev = b
+             prev_b_embedding = b_embedding
+
+         last = outconv_activation
+
+         b_centers = nn.functional.interpolate(
+             b_centers, last.shape[-2:], mode='bilinear', align_corners=True)
+         b_embedding = nn.functional.interpolate(
+             b_embedding, last.shape[-2:], mode='bilinear', align_corners=True)
+
+         clb = self.conditional_log_binomial[bin_conf_name]
+         x = clb(last, b_embedding)
+
+         # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
+         # print(x.shape, b_centers.shape)
+         # b_centers = nn.functional.interpolate(b_centers, x.shape[-2:], mode='bilinear', align_corners=True)
+         out = torch.sum(x * b_centers, dim=1, keepdim=True)
+
+         output = dict(domain_logits=domain_logits, metric_depth=out)
+         if return_final_centers or return_probs:
+             output['bin_centers'] = b_centers
+
+         if return_probs:
+             output['probs'] = x
+         return output
+
+     def get_lr_params(self, lr):
+         """
+         Learning rate configuration for different layers of the model
+
+         Args:
+             lr (float) : Base learning rate
+         Returns:
+             list : list of parameters to optimize and their learning rates, in the format required by torch optimizers.
+         """
+         param_conf = []
+         if self.train_midas:
+             def get_rel_pos_params():
+                 for name, p in self.core.core.pretrained.named_parameters():
+                     if "relative_position" in name:
+                         yield p
+
+             def get_enc_params_except_rel_pos():
+                 for name, p in self.core.core.pretrained.named_parameters():
+                     if "relative_position" not in name:
+                         yield p
+
+             encoder_params = get_enc_params_except_rel_pos()
+             rel_pos_params = get_rel_pos_params()
+             midas_params = self.core.core.scratch.parameters()
+             midas_lr_factor = self.midas_lr_factor if self.is_midas_pretrained else 1.0
+             param_conf.extend([
+                 {'params': encoder_params, 'lr': lr / self.encoder_lr_factor},
+                 {'params': rel_pos_params, 'lr': lr / self.pos_enc_lr_factor},
+                 {'params': midas_params, 'lr': lr / midas_lr_factor}
+             ])
+
+         remaining_modules = []
+         for name, child in self.named_children():
+             if name != 'core':
+                 remaining_modules.append(child)
+         remaining_params = itertools.chain(
+             *[child.parameters() for child in remaining_modules])
+         param_conf.append({'params': remaining_params, 'lr': lr})
+         return param_conf
+
+     def get_conf_parameters(self, conf_name):
+         """
+         Returns parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+         """
+         params = []
+         for name, child in self.named_children():
+             if isinstance(child, nn.ModuleDict):
+                 for bin_conf_name, module in child.items():
+                     if bin_conf_name == conf_name:
+                         params += list(module.parameters())
+         return params
+
+     def freeze_conf(self, conf_name):
+         """
+         Freezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+         """
+         for p in self.get_conf_parameters(conf_name):
+             p.requires_grad = False
+
+     def unfreeze_conf(self, conf_name):
+         """
+         Unfreezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+         """
+         for p in self.get_conf_parameters(conf_name):
+             p.requires_grad = True
+
+     def freeze_all_confs(self):
+         """
+         Freezes all the parameters of all the ModuleDicts children
+         """
+         for name, child in self.named_children():
+             if isinstance(child, nn.ModuleDict):
+                 for bin_conf_name, module in child.items():
+                     for p in module.parameters():
+                         p.requires_grad = False
+
+     @staticmethod
+     def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs):
+         core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas,
+                                train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs)
+         model = ZoeDepthNK(core, **kwargs)
+         if pretrained_resource:
+             assert isinstance(pretrained_resource, str), "pretrained_resource must be a string"
+             model = load_state_from_resource(model, pretrained_resource)
+         return model
+
+     @staticmethod
+     def build_from_config(config):
+         return ZoeDepthNK.build(**config)
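A hedged end-to-end sketch of how this class is typically used (not part of the commit): it assumes the zoedepth package and its MiDaS dependencies are importable, and that the DPT_BEiT_L_384 backbone and ZoeD_M12_NK checkpoint can be fetched.

import torch
from zoedepth.utils.config import get_config
from zoedepth.models.zoedepth_nk import ZoeDepthNK

config = get_config("zoedepth_nk", "infer")        # merges config_zoedepth_nk.json for inference
model = ZoeDepthNK.build_from_config(config).eval()

x = torch.rand(1, 3, 384, 512)                     # (B, C, H, W); real inputs are ImageNet-normalized
with torch.no_grad():
    out = model(x, return_probs=True)
print(out["metric_depth"].shape)                   # metric depth, routed to the NYU or KITTI head
print(out["domain_logits"].shape)                  # (1, 2) indoor/outdoor router logits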
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/__init__.py ADDED
@@ -0,0 +1,24 @@
+ # MIT License
+
+ # Copyright (c) 2022 Intelligent Systems Lab Org
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # File author: Shariq Farooq Bhat
+
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/arg_utils.py ADDED
@@ -0,0 +1,33 @@
+
+
+ def infer_type(x):  # hacky way to infer type from string args
+     if not isinstance(x, str):
+         return x
+
+     try:
+         x = int(x)
+         return x
+     except ValueError:
+         pass
+
+     try:
+         x = float(x)
+         return x
+     except ValueError:
+         pass
+
+     return x
+
+
+ def parse_unknown(unknown_args):
+     clean = []
+     for a in unknown_args:
+         if "=" in a:
+             k, v = a.split("=")
+             clean.extend([k, v])
+         else:
+             clean.append(a)
+
+     keys = clean[::2]
+     values = clean[1::2]
+     return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)}
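parse_unknown is meant to consume the leftovers of argparse's parse_known_args, turning "--key value" or "--key=value" pairs into typed config overrides. A short sketch (the surrounding CLI is an assumption, not part of this file):

import argparse
from zoedepth.utils.arg_utils import parse_unknown

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", default="zoedepth_nk")
args, unknown = parser.parse_known_args(["-m", "zoedepth_nk", "--n_bins=128", "--lr", "0.0002"])
overrides = parse_unknown(unknown)
print(overrides)   # {'n_bins': 128, 'lr': 0.0002} -- values typed by infer_type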
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/config.py ADDED
@@ -0,0 +1,437 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import json
26
+ import os
27
+
28
+ from .easydict import EasyDict as edict
29
+ from .arg_utils import infer_type
30
+
31
+ import pathlib
32
+ import platform
33
+
34
+ ROOT = pathlib.Path(__file__).parent.parent.resolve()
35
+
36
+ HOME_DIR = os.path.expanduser("~")
37
+
38
+ COMMON_CONFIG = {
39
+ "save_dir": os.path.expanduser("~/shortcuts/monodepth3_checkpoints"),
40
+ "project": "ZoeDepth",
41
+ "tags": '',
42
+ "notes": "",
43
+ "gpu": None,
44
+ "root": ".",
45
+ "uid": None,
46
+ "print_losses": False
47
+ }
48
+
49
+ DATASETS_CONFIG = {
50
+ "kitti": {
51
+ "dataset": "kitti",
52
+ "min_depth": 0.001,
53
+ "max_depth": 80,
54
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
55
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
56
+ "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt",
57
+ "input_height": 352,
58
+ "input_width": 1216, # 704
59
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
60
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
61
+ "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt",
62
+
63
+ "min_depth_eval": 1e-3,
64
+ "max_depth_eval": 80,
65
+
66
+ "do_random_rotate": True,
67
+ "degree": 1.0,
68
+ "do_kb_crop": True,
69
+ "garg_crop": True,
70
+ "eigen_crop": False,
71
+ "use_right": False
72
+ },
73
+ "kitti_test": {
74
+ "dataset": "kitti",
75
+ "min_depth": 0.001,
76
+ "max_depth": 80,
77
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
78
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
79
+ "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt",
80
+ "input_height": 352,
81
+ "input_width": 1216,
82
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
83
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
84
+ "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt",
85
+
86
+ "min_depth_eval": 1e-3,
87
+ "max_depth_eval": 80,
88
+
89
+ "do_random_rotate": False,
90
+ "degree": 1.0,
91
+ "do_kb_crop": True,
92
+ "garg_crop": True,
93
+ "eigen_crop": False,
94
+ "use_right": False
95
+ },
96
+ "nyu": {
97
+ "dataset": "nyu",
98
+ "avoid_boundary": False,
99
+ "min_depth": 1e-3, # originally 0.1
100
+ "max_depth": 10,
101
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"),
102
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"),
103
+ "filenames_file": "./train_test_inputs/nyudepthv2_train_files_with_gt.txt",
104
+ "input_height": 480,
105
+ "input_width": 640,
106
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"),
107
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"),
108
+ "filenames_file_eval": "./train_test_inputs/nyudepthv2_test_files_with_gt.txt",
109
+ "min_depth_eval": 1e-3,
110
+ "max_depth_eval": 10,
111
+ "min_depth_diff": -10,
112
+ "max_depth_diff": 10,
113
+
114
+ "do_random_rotate": True,
115
+ "degree": 1.0,
116
+ "do_kb_crop": False,
117
+ "garg_crop": False,
118
+ "eigen_crop": True
119
+ },
120
+ "ibims": {
121
+ "dataset": "ibims",
122
+ "ibims_root": os.path.join(HOME_DIR, "shortcuts/datasets/ibims/ibims1_core_raw/"),
123
+ "eigen_crop": True,
124
+ "garg_crop": False,
125
+ "do_kb_crop": False,
126
+ "min_depth_eval": 0,
127
+ "max_depth_eval": 10,
128
+ "min_depth": 1e-3,
129
+ "max_depth": 10
130
+ },
131
+ "sunrgbd": {
132
+ "dataset": "sunrgbd",
133
+ "sunrgbd_root": os.path.join(HOME_DIR, "shortcuts/datasets/SUNRGBD/test/"),
134
+ "eigen_crop": True,
135
+ "garg_crop": False,
136
+ "do_kb_crop": False,
137
+ "min_depth_eval": 0,
138
+ "max_depth_eval": 8,
139
+ "min_depth": 1e-3,
140
+ "max_depth": 10
141
+ },
142
+ "diml_indoor": {
143
+ "dataset": "diml_indoor",
144
+ "diml_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_indoor_test/"),
145
+ "eigen_crop": True,
146
+ "garg_crop": False,
147
+ "do_kb_crop": False,
148
+ "min_depth_eval": 0,
149
+ "max_depth_eval": 10,
150
+ "min_depth": 1e-3,
151
+ "max_depth": 10
152
+ },
153
+ "diml_outdoor": {
154
+ "dataset": "diml_outdoor",
155
+ "diml_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_outdoor_test/"),
156
+ "eigen_crop": False,
157
+ "garg_crop": True,
158
+ "do_kb_crop": False,
159
+ "min_depth_eval": 2,
160
+ "max_depth_eval": 80,
161
+ "min_depth": 1e-3,
162
+ "max_depth": 80
163
+ },
164
+ "diode_indoor": {
165
+ "dataset": "diode_indoor",
166
+ "diode_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_indoor/"),
167
+ "eigen_crop": True,
168
+ "garg_crop": False,
169
+ "do_kb_crop": False,
170
+ "min_depth_eval": 1e-3,
171
+ "max_depth_eval": 10,
172
+ "min_depth": 1e-3,
173
+ "max_depth": 10
174
+ },
175
+ "diode_outdoor": {
176
+ "dataset": "diode_outdoor",
177
+ "diode_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_outdoor/"),
178
+ "eigen_crop": False,
179
+ "garg_crop": True,
180
+ "do_kb_crop": False,
181
+ "min_depth_eval": 1e-3,
182
+ "max_depth_eval": 80,
183
+ "min_depth": 1e-3,
184
+ "max_depth": 80
185
+ },
186
+ "hypersim_test": {
187
+ "dataset": "hypersim_test",
188
+ "hypersim_test_root": os.path.join(HOME_DIR, "shortcuts/datasets/hypersim_test/"),
189
+ "eigen_crop": True,
190
+ "garg_crop": False,
191
+ "do_kb_crop": False,
192
+ "min_depth_eval": 1e-3,
193
+ "max_depth_eval": 80,
194
+ "min_depth": 1e-3,
195
+ "max_depth": 10
196
+ },
197
+ "vkitti": {
198
+ "dataset": "vkitti",
199
+ "vkitti_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti_test/"),
200
+ "eigen_crop": False,
201
+ "garg_crop": True,
202
+ "do_kb_crop": True,
203
+ "min_depth_eval": 1e-3,
204
+ "max_depth_eval": 80,
205
+ "min_depth": 1e-3,
206
+ "max_depth": 80
207
+ },
208
+ "vkitti2": {
209
+ "dataset": "vkitti2",
210
+ "vkitti2_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti2/"),
211
+ "eigen_crop": False,
212
+ "garg_crop": True,
213
+ "do_kb_crop": True,
214
+ "min_depth_eval": 1e-3,
215
+ "max_depth_eval": 80,
216
+ "min_depth": 1e-3,
217
+ "max_depth": 80,
218
+ },
219
+ "ddad": {
220
+ "dataset": "ddad",
221
+ "ddad_root": os.path.join(HOME_DIR, "shortcuts/datasets/ddad/ddad_val/"),
222
+ "eigen_crop": False,
223
+ "garg_crop": True,
224
+ "do_kb_crop": True,
225
+ "min_depth_eval": 1e-3,
226
+ "max_depth_eval": 80,
227
+ "min_depth": 1e-3,
228
+ "max_depth": 80,
229
+ },
230
+ }
231
+
232
+ ALL_INDOOR = ["nyu", "ibims", "sunrgbd", "diode_indoor", "hypersim_test"]
233
+ ALL_OUTDOOR = ["kitti", "diml_outdoor", "diode_outdoor", "vkitti2", "ddad"]
234
+ ALL_EVAL_DATASETS = ALL_INDOOR + ALL_OUTDOOR
235
+
236
+ COMMON_TRAINING_CONFIG = {
237
+ "dataset": "nyu",
238
+ "distributed": True,
239
+ "workers": 16,
240
+ "clip_grad": 0.1,
241
+ "use_shared_dict": False,
242
+ "shared_dict": None,
243
+ "use_amp": False,
244
+
245
+ "aug": True,
246
+ "random_crop": False,
247
+ "random_translate": False,
248
+ "translate_prob": 0.2,
249
+ "max_translation": 100,
250
+
251
+ "validate_every": 0.25,
252
+ "log_images_every": 0.1,
253
+ "prefetch": False,
254
+ }
255
+
256
+
257
+ def flatten(config, except_keys=('bin_conf')):
258
+ def recurse(inp):
259
+ if isinstance(inp, dict):
260
+ for key, value in inp.items():
261
+ if key in except_keys:
262
+ yield (key, value)
263
+ if isinstance(value, dict):
264
+ yield from recurse(value)
265
+ else:
266
+ yield (key, value)
267
+
268
+ return dict(list(recurse(config)))
269
+
270
+
271
+ def split_combined_args(kwargs):
272
+ """Splits the arguments that are combined with '__' into multiple arguments.
273
+ Combined arguments should have equal number of keys and values.
274
+ Keys are separated by '__' and Values are separated with ';'.
275
+ For example, '__n_bins__lr=256;0.001'
276
+
277
+ Args:
278
+ kwargs (dict): key-value pairs of arguments where key-value is optionally combined according to the above format.
279
+
280
+ Returns:
281
+ dict: Parsed dict with the combined arguments split into individual key-value pairs.
282
+ """
283
+ new_kwargs = dict(kwargs)
284
+ for key, value in kwargs.items():
285
+ if key.startswith("__"):
286
+ keys = key.split("__")[1:]
287
+ values = value.split(";")
288
+ assert len(keys) == len(
289
+ values), f"Combined arguments should have equal number of keys and values. Keys are separated by '__' and Values are separated with ';'. For example, '__n_bins__lr=256;0.001. Given (keys,values) is ({keys}, {values})"
290
+ for k, v in zip(keys, values):
291
+ new_kwargs[k] = v
292
+ return new_kwargs
293
+
294
+
295
+ def parse_list(config, key, dtype=int):
296
+ """Parse a list of values for the key if the value is a string. The values are separated by a comma.
297
+ Modifies the config in place.
298
+ """
299
+ if key in config:
300
+ if isinstance(config[key], str):
301
+ config[key] = list(map(dtype, config[key].split(',')))
302
+ assert isinstance(config[key], list) and all([isinstance(e, dtype) for e in config[key]]
303
+ ), f"{key} should be a list of values dtype {dtype}. Given {config[key]} of type {type(config[key])} with values of type {[type(e) for e in config[key]]}."
304
+
305
+
306
+ def get_model_config(model_name, model_version=None):
307
+ """Find and parse the .json config file for the model.
308
+
309
+ Args:
310
+ model_name (str): name of the model. The config file should be named config_{model_name}[_{model_version}].json under the models/{model_name} directory.
311
+ model_version (str, optional): Specific config version. If specified config_{model_name}_{model_version}.json is searched for and used. Otherwise config_{model_name}.json is used. Defaults to None.
312
+
313
+ Returns:
314
+ easydict: the config dictionary for the model.
315
+ """
316
+ config_fname = f"config_{model_name}_{model_version}.json" if model_version is not None else f"config_{model_name}.json"
317
+ config_file = os.path.join(ROOT, "models", model_name, config_fname)
318
+ if not os.path.exists(config_file):
319
+ return None
320
+
321
+ with open(config_file, "r") as f:
322
+ config = edict(json.load(f))
323
+
324
+ # handle dictionary inheritance
325
+ # only training config is supported for inheritance
326
+ if "inherit" in config.train and config.train.inherit is not None:
327
+ inherit_config = get_model_config(config.train["inherit"]).train
328
+ for key, value in inherit_config.items():
329
+ if key not in config.train:
330
+ config.train[key] = value
331
+ return edict(config)
332
+
333
+
334
+ def update_model_config(config, mode, model_name, model_version=None, strict=False):
335
+ model_config = get_model_config(model_name, model_version)
336
+ if model_config is not None:
337
+ config = {**config, **
338
+ flatten({**model_config.model, **model_config[mode]})}
339
+ elif strict:
340
+ raise ValueError(f"Config file for model {model_name} not found.")
341
+ return config
342
+
343
+
344
+ def check_choices(name, value, choices):
345
+ # return # No checks in dev branch
346
+ if value not in choices:
347
+ raise ValueError(f"{name} {value} not in supported choices {choices}")
348
+
349
+
350
+ KEYS_TYPE_BOOL = ["use_amp", "distributed", "use_shared_dict", "same_lr", "aug", "three_phase",
351
+ "prefetch", "cycle_momentum"] # Casting is not necessary as their int casted values in config are 0 or 1
352
+
353
+
354
+ def get_config(model_name, mode='train', dataset=None, **overwrite_kwargs):
355
+ """Main entry point to get the config for the model.
356
+
357
+ Args:
358
+ model_name (str): name of the desired model.
359
+ mode (str, optional): "train" or "infer". Defaults to 'train'.
360
+ dataset (str, optional): If specified, the corresponding dataset configuration is loaded as well. Defaults to None.
361
+
362
+ Keyword Args: key-value pairs of arguments to overwrite the default config.
363
+
364
+ The order of precedence for overwriting the config is (Higher precedence first):
365
+ # 1. overwrite_kwargs
366
+ # 2. "config_version": Config file version if specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{config_version}.json
367
+ # 3. "version_name": Default Model version specific config specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{version_name}.json
368
+ # 4. common_config: Default config for all models specified in COMMON_CONFIG
369
+
370
+ Returns:
371
+ easydict: The config dictionary for the model.
372
+ """
373
+
374
+
375
+ check_choices("Model", model_name, ["zoedepth", "zoedepth_nk"])
376
+ check_choices("Mode", mode, ["train", "infer", "eval"])
377
+ if mode == "train":
378
+ check_choices("Dataset", dataset, ["nyu", "kitti", "mix", None])
379
+
380
+ config = flatten({**COMMON_CONFIG, **COMMON_TRAINING_CONFIG})
381
+ config = update_model_config(config, mode, model_name)
382
+
383
+ # update with model version specific config
384
+ version_name = overwrite_kwargs.get("version_name", config["version_name"])
385
+ config = update_model_config(config, mode, model_name, version_name)
386
+
387
+ # update with config version if specified
388
+ config_version = overwrite_kwargs.get("config_version", None)
389
+ if config_version is not None:
390
+ print("Overwriting config with config_version", config_version)
391
+ config = update_model_config(config, mode, model_name, config_version)
392
+
393
+ # update with overwrite_kwargs
394
+ # Combined args are useful for hyperparameter search
395
+ overwrite_kwargs = split_combined_args(overwrite_kwargs)
396
+ config = {**config, **overwrite_kwargs}
397
+
398
+ # Casting to bool # TODO: Not necessary. Remove and test
399
+ for key in KEYS_TYPE_BOOL:
400
+ if key in config:
401
+ config[key] = bool(config[key])
402
+
403
+ # Model specific post processing of config
404
+ parse_list(config, "n_attractors")
405
+
406
+ # adjust n_bins for each bin configuration if bin_conf is given and n_bins is passed in overwrite_kwargs
407
+ if 'bin_conf' in config and 'n_bins' in overwrite_kwargs:
408
+ bin_conf = config['bin_conf'] # list of dicts
409
+ n_bins = overwrite_kwargs['n_bins']
410
+ new_bin_conf = []
411
+ for conf in bin_conf:
412
+ conf['n_bins'] = n_bins
413
+ new_bin_conf.append(conf)
414
+ config['bin_conf'] = new_bin_conf
415
+
416
+ if mode == "train":
417
+ orig_dataset = dataset
418
+ if dataset == "mix":
419
+ dataset = 'nyu' # Use nyu as default for mix. Dataset config is changed accordingly while loading the dataloader
420
+ if dataset is not None:
421
+ config['project'] = f"MonoDepth3-{orig_dataset}" # Set project for wandb
422
+
423
+ if dataset is not None:
424
+ config['dataset'] = dataset
425
+ config = {**DATASETS_CONFIG[dataset], **config}
426
+
427
+
428
+ config['model'] = model_name
429
+ typed_config = {k: infer_type(v) for k, v in config.items()}
430
+ # add hostname to config
431
+ config['hostname'] = platform.node()
432
+ return edict(typed_config)
433
+
434
+
435
+ def change_dataset(config, new_dataset):
436
+ config.update(DATASETS_CONFIG[new_dataset])
437
+ return config
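get_config is the main entry point built on top of these tables; overwrite kwargs take highest precedence, then the version-specific JSON, then the common defaults, and the result comes back as an EasyDict. A minimal sketch, assuming the zoedepth package is importable:

from zoedepth.utils.config import get_config

conf = get_config("zoedepth_nk", mode="train", dataset="nyu", n_bins=128)
print(conf.model, conf.dataset)     # "zoedepth_nk", "nyu"
print(conf.bin_conf[0].n_bins)      # 128 -- the n_bins override is pushed into every bin_conf entry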
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/easydict/__init__.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ EasyDict
3
+ Copy/pasted from https://github.com/makinacorpus/easydict
4
+ Original author: Mathieu Leplatre <mathieu.leplatre@makina-corpus.com>
5
+ """
6
+
7
+ class EasyDict(dict):
8
+ """
9
+ Get attributes
10
+
11
+ >>> d = EasyDict({'foo':3})
12
+ >>> d['foo']
13
+ 3
14
+ >>> d.foo
15
+ 3
16
+ >>> d.bar
17
+ Traceback (most recent call last):
18
+ ...
19
+ AttributeError: 'EasyDict' object has no attribute 'bar'
20
+
21
+ Works recursively
22
+
23
+ >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}})
24
+ >>> isinstance(d.bar, dict)
25
+ True
26
+ >>> d.bar.x
27
+ 1
28
+
29
+ Bullet-proof
30
+
31
+ >>> EasyDict({})
32
+ {}
33
+ >>> EasyDict(d={})
34
+ {}
35
+ >>> EasyDict(None)
36
+ {}
37
+ >>> d = {'a': 1}
38
+ >>> EasyDict(**d)
39
+ {'a': 1}
40
+ >>> EasyDict((('a', 1), ('b', 2)))
41
+ {'a': 1, 'b': 2}
42
+
43
+ Set attributes
44
+
45
+ >>> d = EasyDict()
46
+ >>> d.foo = 3
47
+ >>> d.foo
48
+ 3
49
+ >>> d.bar = {'prop': 'value'}
50
+ >>> d.bar.prop
51
+ 'value'
52
+ >>> d
53
+ {'foo': 3, 'bar': {'prop': 'value'}}
54
+ >>> d.bar.prop = 'newer'
55
+ >>> d.bar.prop
56
+ 'newer'
57
+
58
+
59
+ Values extraction
60
+
61
+ >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]})
62
+ >>> isinstance(d.bar, list)
63
+ True
64
+ >>> from operator import attrgetter
65
+ >>> list(map(attrgetter('x'), d.bar))
66
+ [1, 3]
67
+ >>> list(map(attrgetter('y'), d.bar))
68
+ [2, 4]
69
+ >>> d = EasyDict()
70
+ >>> list(d.keys())
71
+ []
72
+ >>> d = EasyDict(foo=3, bar=dict(x=1, y=2))
73
+ >>> d.foo
74
+ 3
75
+ >>> d.bar.x
76
+ 1
77
+
78
+ Still like a dict though
79
+
80
+ >>> o = EasyDict({'clean':True})
81
+ >>> list(o.items())
82
+ [('clean', True)]
83
+
84
+ And like a class
85
+
86
+ >>> class Flower(EasyDict):
87
+ ... power = 1
88
+ ...
89
+ >>> f = Flower()
90
+ >>> f.power
91
+ 1
92
+ >>> f = Flower({'height': 12})
93
+ >>> f.height
94
+ 12
95
+ >>> f['power']
96
+ 1
97
+ >>> sorted(f.keys())
98
+ ['height', 'power']
99
+
100
+ update and pop items
101
+ >>> d = EasyDict(a=1, b='2')
102
+ >>> e = EasyDict(c=3.0, a=9.0)
103
+ >>> d.update(e)
104
+ >>> d.c
105
+ 3.0
106
+ >>> d['c']
107
+ 3.0
108
+ >>> d.get('c')
109
+ 3.0
110
+ >>> d.update(a=4, b=4)
111
+ >>> d.b
112
+ 4
113
+ >>> d.pop('a')
114
+ 4
115
+ >>> d.a
116
+ Traceback (most recent call last):
117
+ ...
118
+ AttributeError: 'EasyDict' object has no attribute 'a'
119
+ """
120
+ def __init__(self, d=None, **kwargs):
121
+ if d is None:
122
+ d = {}
123
+ else:
124
+ d = dict(d)
125
+ if kwargs:
126
+ d.update(**kwargs)
127
+ for k, v in d.items():
128
+ setattr(self, k, v)
129
+ # Class attributes
130
+ for k in self.__class__.__dict__.keys():
131
+ if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'):
132
+ setattr(self, k, getattr(self, k))
133
+
134
+ def __setattr__(self, name, value):
135
+ if isinstance(value, (list, tuple)):
136
+ value = [self.__class__(x)
137
+ if isinstance(x, dict) else x for x in value]
138
+ elif isinstance(value, dict) and not isinstance(value, self.__class__):
139
+ value = self.__class__(value)
140
+ super(EasyDict, self).__setattr__(name, value)
141
+ super(EasyDict, self).__setitem__(name, value)
142
+
143
+ __setitem__ = __setattr__
144
+
145
+ def update(self, e=None, **f):
146
+ d = e or dict()
147
+ d.update(f)
148
+ for k in d:
149
+ setattr(self, k, d[k])
150
+
151
+ def pop(self, k, d=None):
152
+ delattr(self, k)
153
+ return super(EasyDict, self).pop(k, d)
154
+
155
+
156
+ if __name__ == "__main__":
157
+ import doctest
158
+ doctest.testmod()
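This vendored EasyDict is what makes the attribute-style access used throughout the config code (config.train.inherit, conf.min_depth, ...) work. A two-line sketch, with the import path taken from this extension's layout:

from zoedepth.utils.easydict import EasyDict as edict

c = edict({"model": "zoedepth_nk", "train": {"bs": 16}})
print(c.model, c.train.bs)   # nested dicts are converted recursively; c["train"]["bs"] stays in sync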
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/geometry.py ADDED
@@ -0,0 +1,98 @@
+ # MIT License
+
+ # Copyright (c) 2022 Intelligent Systems Lab Org
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # File author: Shariq Farooq Bhat
+
+ import numpy as np
+
+ def get_intrinsics(H, W):
+     """
+     Intrinsics for a pinhole camera model.
+     Assume fov of 55 degrees and central principal point.
+     """
+     f = 0.5 * W / np.tan(0.5 * 55 * np.pi / 180.0)
+     cx = 0.5 * W
+     cy = 0.5 * H
+     return np.array([[f, 0, cx],
+                      [0, f, cy],
+                      [0, 0, 1]])
+
+ def depth_to_points(depth, R=None, t=None):
+
+     K = get_intrinsics(depth.shape[1], depth.shape[2])
+     Kinv = np.linalg.inv(K)
+     if R is None:
+         R = np.eye(3)
+     if t is None:
+         t = np.zeros(3)
+
+     # M converts from your coordinate to PyTorch3D's coordinate system
+     M = np.eye(3)
+     M[0, 0] = -1.0
+     M[1, 1] = -1.0
+
+     height, width = depth.shape[1:3]
+
+     x = np.arange(width)
+     y = np.arange(height)
+     coord = np.stack(np.meshgrid(x, y), -1)
+     coord = np.concatenate((coord, np.ones_like(coord)[:, :, [0]]), -1)  # z=1
+     coord = coord.astype(np.float32)
+     # coord = torch.as_tensor(coord, dtype=torch.float32, device=device)
+     coord = coord[None]  # bs, h, w, 3
+
+     D = depth[:, :, :, None, None]
+     # print(D.shape, Kinv[None, None, None, ...].shape, coord[:, :, :, :, None].shape )
+     pts3D_1 = D * Kinv[None, None, None, ...] @ coord[:, :, :, :, None]
+     # pts3D_1 live in your coordinate system. Convert them to Py3D's
+     pts3D_1 = M[None, None, None, ...] @ pts3D_1
+     # from reference to target viewpoint
+     pts3D_2 = R[None, None, None, ...] @ pts3D_1 + t[None, None, None, :, None]
+     # pts3D_2 = pts3D_1
+     # depth_2 = pts3D_2[:, :, :, 2, :]  # b,1,h,w
+     return pts3D_2[:, :, :, :3, 0][0]
+
+
+ def create_triangles(h, w, mask=None):
+     """
+     Reference: https://github.com/google-research/google-research/blob/e96197de06613f1b027d20328e06d69829fa5a89/infinite_nature/render_utils.py#L68
+     Creates mesh triangle indices from a given pixel grid size.
+     This function is not and need not be differentiable as triangle indices are
+     fixed.
+     Args:
+         h: (int) denoting the height of the image.
+         w: (int) denoting the width of the image.
+     Returns:
+         triangles: 2D numpy array of indices (int) with shape (2(W-1)(H-1) x 3)
+     """
+     x, y = np.meshgrid(range(w - 1), range(h - 1))
+     tl = y * w + x
+     tr = y * w + x + 1
+     bl = (y + 1) * w + x
+     br = (y + 1) * w + x + 1
+     triangles = np.array([tl, bl, tr, br, tr, bl])
+     triangles = np.transpose(triangles, (1, 2, 0)).reshape(
+         ((w - 1) * (h - 1) * 2, 3))
+     if mask is not None:
+         mask = mask.reshape(-1)
+         triangles = triangles[mask[triangles].all(1)]
+     return triangles
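A small sketch of how these two helpers fit together, unprojecting a depth map into a point cloud and building mesh triangles for it (dummy data; the import path is assumed from this extension's layout):

import numpy as np
from zoedepth.utils.geometry import depth_to_points, create_triangles

depth = np.ones((1, 480, 640), dtype=np.float32)   # (1, H, W) metric depth
pts3d = depth_to_points(depth)                     # (H, W, 3) points in the PyTorch3D-style frame
tris = create_triangles(480, 640)                  # (2*(H-1)*(W-1), 3) vertex indices into the H*W grid
print(pts3d.shape, tris.shape)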
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/misc.py ADDED
@@ -0,0 +1,368 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ """Miscellaneous utility functions."""
26
+
27
+ from scipy import ndimage
28
+
29
+ import base64
30
+ import math
31
+ import re
32
+ from io import BytesIO
33
+
34
+ import matplotlib
35
+ import matplotlib.cm
36
+ import numpy as np
37
+ import requests
38
+ import torch
39
+ import torch.distributed as dist
40
+ import torch.nn
41
+ import torch.nn as nn
42
+ import torch.utils.data.distributed
43
+ from PIL import Image
44
+ from torchvision.transforms import ToTensor
45
+
46
+
47
+ class RunningAverage:
48
+ def __init__(self):
49
+ self.avg = 0
50
+ self.count = 0
51
+
52
+ def append(self, value):
53
+ self.avg = (value + self.count * self.avg) / (self.count + 1)
54
+ self.count += 1
55
+
56
+ def get_value(self):
57
+ return self.avg
58
+
59
+
60
+ def denormalize(x):
61
+ """Reverses the imagenet normalization applied to the input.
62
+
63
+ Args:
64
+ x (torch.Tensor - shape(N,3,H,W)): input tensor
65
+
66
+ Returns:
67
+ torch.Tensor - shape(N,3,H,W): Denormalized input
68
+ """
69
+ mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
70
+ std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
71
+ return x * std + mean
72
+
73
+
74
+ class RunningAverageDict:
75
+ """A dictionary of running averages."""
76
+ def __init__(self):
77
+ self._dict = None
78
+
79
+ def update(self, new_dict):
80
+ if new_dict is None:
81
+ return
82
+
83
+ if self._dict is None:
84
+ self._dict = dict()
85
+ for key, value in new_dict.items():
86
+ self._dict[key] = RunningAverage()
87
+
88
+ for key, value in new_dict.items():
89
+ self._dict[key].append(value)
90
+
91
+ def get_value(self):
92
+ if self._dict is None:
93
+ return None
94
+ return {key: value.get_value() for key, value in self._dict.items()}
95
+
96
+
97
+ def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None):
98
+ """Converts a depth map to a color image.
99
+
100
+ Args:
101
+ value (torch.Tensor, numpy.ndarray): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed
102
+ vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None.
103
+ vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None.
104
+ cmap (str, optional): matplotlib colormap to use. Defaults to 'gray_r'.
105
+ invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99.
106
+ invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None.
107
+ background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255).
108
+ gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False.
109
+ value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None.
110
+
111
+ Returns:
112
+ numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4)
113
+ """
114
+ if isinstance(value, torch.Tensor):
115
+ value = value.detach().cpu().numpy()
116
+
117
+ value = value.squeeze()
118
+ if invalid_mask is None:
119
+ invalid_mask = value == invalid_val
120
+ mask = np.logical_not(invalid_mask)
121
+
122
+ # normalize
123
+ vmin = np.percentile(value[mask],2) if vmin is None else vmin
124
+ vmax = np.percentile(value[mask],85) if vmax is None else vmax
125
+ if vmin != vmax:
126
+ value = (value - vmin) / (vmax - vmin) # vmin..vmax
127
+ else:
128
+ # Avoid 0-division
129
+ value = value * 0.
130
+
131
+ # squeeze last dim if it exists
132
+ # grey out the invalid values
133
+
134
+ value[invalid_mask] = np.nan
135
+ cmapper = matplotlib.cm.get_cmap(cmap)
136
+ if value_transform:
137
+ value = value_transform(value)
138
+ # value = value / value.max()
139
+ value = cmapper(value, bytes=True) # (nxmx4)
140
+
141
+ # img = value[:, :, :]
142
+ img = value[...]
143
+ img[invalid_mask] = background_color
144
+
145
+ # return img.transpose((2, 0, 1))
146
+ if gamma_corrected:
147
+ # gamma correction
148
+ img = img / 255
149
+ img = np.power(img, 2.2)
150
+ img = img * 255
151
+ img = img.astype(np.uint8)
152
+ return img
153
+
154
+
155
+ def count_parameters(model, include_all=False):
156
+ return sum(p.numel() for p in model.parameters() if p.requires_grad or include_all)
157
+
158
+
159
+ def compute_errors(gt, pred):
160
+ """Compute metrics for 'pred' compared to 'gt'
161
+
162
+ Args:
163
+ gt (numpy.ndarray): Ground truth values
164
+ pred (numpy.ndarray): Predicted values
165
+
166
+ gt.shape should be equal to pred.shape
167
+
168
+ Returns:
169
+ dict: Dictionary containing the following metrics:
170
+ 'a1': Delta1 accuracy: Fraction of pixels that are within a scale factor of 1.25
171
+ 'a2': Delta2 accuracy: Fraction of pixels that are within a scale factor of 1.25^2
172
+ 'a3': Delta3 accuracy: Fraction of pixels that are within a scale factor of 1.25^3
173
+ 'abs_rel': Absolute relative error
174
+ 'rmse': Root mean squared error
175
+ 'log_10': Absolute log10 error
176
+ 'sq_rel': Squared relative error
177
+ 'rmse_log': Root mean squared error on the log scale
178
+ 'silog': Scale invariant log error
179
+ """
180
+ thresh = np.maximum((gt / pred), (pred / gt))
181
+ a1 = (thresh < 1.25).mean()
182
+ a2 = (thresh < 1.25 ** 2).mean()
183
+ a3 = (thresh < 1.25 ** 3).mean()
184
+
185
+ abs_rel = np.mean(np.abs(gt - pred) / gt)
186
+ sq_rel = np.mean(((gt - pred) ** 2) / gt)
187
+
188
+ rmse = (gt - pred) ** 2
189
+ rmse = np.sqrt(rmse.mean())
190
+
191
+ rmse_log = (np.log(gt) - np.log(pred)) ** 2
192
+ rmse_log = np.sqrt(rmse_log.mean())
193
+
194
+ err = np.log(pred) - np.log(gt)
195
+ silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100
196
+
197
+ log_10 = (np.abs(np.log10(gt) - np.log10(pred))).mean()
198
+ return dict(a1=a1, a2=a2, a3=a3, abs_rel=abs_rel, rmse=rmse, log_10=log_10, rmse_log=rmse_log,
199
+ silog=silog, sq_rel=sq_rel)
200
+
201
+
202
+ def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True, dataset='nyu', min_depth_eval=0.1, max_depth_eval=10, **kwargs):
203
+ """Compute metrics of predicted depth maps. Applies cropping and masking as necessary or specified via arguments. Refer to compute_errors for more details on metrics.
204
+ """
205
+ if 'config' in kwargs:
206
+ config = kwargs['config']
207
+ garg_crop = config.garg_crop
208
+ eigen_crop = config.eigen_crop
209
+ min_depth_eval = config.min_depth_eval
210
+ max_depth_eval = config.max_depth_eval
211
+
212
+ if gt.shape[-2:] != pred.shape[-2:] and interpolate:
213
+ pred = nn.functional.interpolate(
214
+ pred, gt.shape[-2:], mode='bilinear', align_corners=True)
215
+
216
+ pred = pred.squeeze().cpu().numpy()
217
+ pred[pred < min_depth_eval] = min_depth_eval
218
+ pred[pred > max_depth_eval] = max_depth_eval
219
+ pred[np.isinf(pred)] = max_depth_eval
220
+ pred[np.isnan(pred)] = min_depth_eval
221
+
222
+ gt_depth = gt.squeeze().cpu().numpy()
223
+ valid_mask = np.logical_and(
224
+ gt_depth > min_depth_eval, gt_depth < max_depth_eval)
225
+
226
+ if garg_crop or eigen_crop:
227
+ gt_height, gt_width = gt_depth.shape
228
+ eval_mask = np.zeros(valid_mask.shape)
229
+
230
+ if garg_crop:
231
+ eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height),
232
+ int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1
233
+
234
+ elif eigen_crop:
235
+ # print("-"*10, " EIGEN CROP ", "-"*10)
236
+ if dataset == 'kitti':
237
+ eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height),
238
+ int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
239
+ else:
240
+ # assert gt_depth.shape == (480, 640), "Error: Eigen crop is currently only valid for (480, 640) images"
241
+ eval_mask[45:471, 41:601] = 1
242
+ else:
243
+ eval_mask = np.ones(valid_mask.shape)
244
+ valid_mask = np.logical_and(valid_mask, eval_mask)
245
+ return compute_errors(gt_depth[valid_mask], pred[valid_mask])
246
+
247
+
248
+ #################################### Model utils ################################################
249
+
250
+
251
+ def parallelize(config, model, find_unused_parameters=True):
252
+
253
+ if config.gpu is not None:
254
+ torch.cuda.set_device(config.gpu)
255
+ model = model.cuda(config.gpu)
256
+
257
+ config.multigpu = False
258
+ if config.distributed:
259
+ # Use DDP
260
+ config.multigpu = True
261
+ config.rank = config.rank * config.ngpus_per_node + config.gpu
262
+ dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url,
263
+ world_size=config.world_size, rank=config.rank)
264
+ config.batch_size = int(config.batch_size / config.ngpus_per_node)
265
+ # config.batch_size = 8
266
+ config.workers = int(
267
+ (config.num_workers + config.ngpus_per_node - 1) / config.ngpus_per_node)
268
+ print("Device", config.gpu, "Rank", config.rank, "batch size",
269
+ config.batch_size, "Workers", config.workers)
270
+ torch.cuda.set_device(config.gpu)
271
+ model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
272
+ model = model.cuda(config.gpu)
273
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu], output_device=config.gpu,
274
+ find_unused_parameters=find_unused_parameters)
275
+
276
+ elif config.gpu is None:
277
+ # Use DP
278
+ config.multigpu = True
279
+ model = model.cuda()
280
+ model = torch.nn.DataParallel(model)
281
+
282
+ return model
283
+
284
+
285
+ #################################################################################################
286
+
287
+
288
+ #####################################################################################################
289
+
290
+
291
+ class colors:
292
+ '''Colors class:
293
+ Reset all colors with colors.reset
294
+ Two subclasses fg for foreground and bg for background.
295
+ Use as colors.subclass.colorname.
296
+ i.e. colors.fg.red or colors.bg.green
297
+ Also, the generic bold, disable, underline, reverse, strikethrough,
298
+ and invisible work with the main class
299
+ i.e. colors.bold
300
+ '''
301
+ reset = '\033[0m'
302
+ bold = '\033[01m'
303
+ disable = '\033[02m'
304
+ underline = '\033[04m'
305
+ reverse = '\033[07m'
306
+ strikethrough = '\033[09m'
307
+ invisible = '\033[08m'
308
+
309
+ class fg:
310
+ black = '\033[30m'
311
+ red = '\033[31m'
312
+ green = '\033[32m'
313
+ orange = '\033[33m'
314
+ blue = '\033[34m'
315
+ purple = '\033[35m'
316
+ cyan = '\033[36m'
317
+ lightgrey = '\033[37m'
318
+ darkgrey = '\033[90m'
319
+ lightred = '\033[91m'
320
+ lightgreen = '\033[92m'
321
+ yellow = '\033[93m'
322
+ lightblue = '\033[94m'
323
+ pink = '\033[95m'
324
+ lightcyan = '\033[96m'
325
+
326
+ class bg:
327
+ black = '\033[40m'
328
+ red = '\033[41m'
329
+ green = '\033[42m'
330
+ orange = '\033[43m'
331
+ blue = '\033[44m'
332
+ purple = '\033[45m'
333
+ cyan = '\033[46m'
334
+ lightgrey = '\033[47m'
335
+
336
+
337
+ def printc(text, color):
338
+ print(f"{color}{text}{colors.reset}")
339
+
340
+ ############################################
341
+
342
+ def get_image_from_url(url):
343
+ response = requests.get(url)
344
+ img = Image.open(BytesIO(response.content)).convert("RGB")
345
+ return img
346
+
347
+ def url_to_torch(url, size=(384, 384)):
348
+ img = get_image_from_url(url)
349
+ img = img.resize(size, Image.LANCZOS) # Image.ANTIALIAS was removed in newer Pillow; LANCZOS is the same filter
350
+ img = torch.from_numpy(np.asarray(img)).float()
351
+ img = img.permute(2, 0, 1)
352
+ img.div_(255)
353
+ return img
354
+
355
+ def pil_to_batched_tensor(img):
356
+ return ToTensor()(img).unsqueeze(0)
357
+
358
+ def save_raw_16bit(depth, fpath="raw.png"):
359
+ if isinstance(depth, torch.Tensor):
360
+ depth = depth.squeeze().cpu().numpy()
361
+
362
+ assert isinstance(depth, np.ndarray), "Depth must be a torch tensor or numpy array"
363
+ assert depth.ndim == 2, "Depth must be 2D"
364
+ depth = depth * 256 # scale for 16-bit png
365
+ depth = depth.astype(np.uint16)
366
+ depth = Image.fromarray(depth)
367
+ depth.save(fpath)
368
+ print("Saved raw depth to", fpath)
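For reference, a minimal usage sketch of the evaluation and visualization helpers defined above (RunningAverageDict, compute_metrics, colorize, save_raw_16bit). The tensors are random stand-ins for a real ground-truth/prediction pair and the output filenames are arbitrary, so treat this as an illustration rather than part of the module.

import torch
from PIL import Image

# toy (N, 1, H, W) ground truth and prediction in the NYU depth range
gt = torch.rand(1, 1, 480, 640) * 9.8 + 0.2
pred = (gt + 0.05 * torch.randn_like(gt)).clamp(min=0.01)

metrics = RunningAverageDict()
metrics.update(compute_metrics(gt, pred, dataset='nyu'))  # a1/a2/a3, abs_rel, rmse, ...
print(metrics.get_value())

colored = colorize(pred, vmin=0.1, vmax=10, cmap='magma_r')  # (H, W, 4) uint8 RGBA
Image.fromarray(colored).save("depth_vis.png")
save_raw_16bit(pred, "raw_depth.png")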
extensions/microsoftexcel-controlnet/example/api_img2img.ipynb ADDED
@@ -0,0 +1,105 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# controlnet + img2img\n",
10
+ "# enable `Allow other script to control this extension` in settings\n",
11
+ "\n",
12
+ "import requests\n",
13
+ "import cv2\n",
14
+ "from base64 import b64encode\n",
15
+ "\n",
16
+ "def readImage(path):\n",
17
+ " img = cv2.imread(path)\n",
18
+ " retval, buffer = cv2.imencode('.jpg', img)\n",
19
+ " b64img = b64encode(buffer).decode(\"utf-8\")\n",
20
+ " return b64img\n",
21
+ "\n",
22
+ "b64img = readImage(\"/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg\")\n",
23
+ "\n",
24
+ "class controlnetRequest():\n",
25
+ " def __init__(self, prompt):\n",
26
+ " self.url = \"http://localhost:7860/controlnet/img2img\"\n",
27
+ " self.body = {\n",
28
+ " \"init_images\": [b64img],\n",
29
+ " \"prompt\": prompt,\n",
30
+ " \"negative_prompt\": \"\",\n",
31
+ " \"seed\": -1,\n",
32
+ " \"subseed\": -1,\n",
33
+ " \"subseed_strength\": 0,\n",
34
+ " \"batch_size\": 1,\n",
35
+ " \"n_iter\": 1,\n",
36
+ " \"steps\": 20,\n",
37
+ " \"cfg_scale\": 7,\n",
38
+ " \"width\": 512,\n",
39
+ " \"height\": 768,\n",
40
+ " \"restore_faces\": True,\n",
41
+ " \"eta\": 0,\n",
42
+ " \"sampler_index\": \"Euler a\",\n",
43
+ " \"controlnet_input_image\": [b64img],\n",
44
+ " \"controlnet_module\": 'canny',\n",
45
+ " \"controlnet_model\": 'control_canny-fp16 [e3fe7712]',\n",
46
+ " \"controlnet_guidance\": 1.0,\n",
47
+ " }\n",
48
+ "\n",
49
+ " def sendRequest(self):\n",
50
+ " r = requests.post(self.url, json=self.body)\n",
51
+ " return r.json()\n",
52
+ "\n",
53
+ "js = controlnetRequest(\"walter white\").sendRequest()"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "metadata": {},
60
+ "outputs": [],
61
+ "source": [
62
+ "import io, base64\n",
63
+ "import matplotlib.pyplot as plt\n",
64
+ "from PIL import Image\n",
65
+ "\n",
66
+ "pil_img = Image.open('/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg')\n",
67
+ "image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][0])))\n",
68
+ "mask_image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][1])))\n",
69
+ "\n",
70
+ "plt.figure()\n",
71
+ "f, axarr = plt.subplots(1,3) \n",
72
+ "axarr[0].imshow(pil_img) \n",
73
+ "axarr[1].imshow(image) \n",
74
+ "axarr[2].imshow(mask_image) "
75
+ ]
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "kernelspec": {
80
+ "display_name": "pynb",
81
+ "language": "python",
82
+ "name": "python3"
83
+ },
84
+ "language_info": {
85
+ "codemirror_mode": {
86
+ "name": "ipython",
87
+ "version": 3
88
+ },
89
+ "file_extension": ".py",
90
+ "mimetype": "text/x-python",
91
+ "name": "python",
92
+ "nbconvert_exporter": "python",
93
+ "pygments_lexer": "ipython3",
94
+ "version": "3.10.9"
95
+ },
96
+ "orig_nbformat": 4,
97
+ "vscode": {
98
+ "interpreter": {
99
+ "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206"
100
+ }
101
+ }
102
+ },
103
+ "nbformat": 4,
104
+ "nbformat_minor": 2
105
+ }
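The notebook only plots the returned images; if you also want to keep them, a small follow-up cell along these lines saves every image in the response (`js` is the dict returned by `sendRequest()` above; filenames are arbitrary):

import io, base64
from PIL import Image

for i, b64 in enumerate(js["images"]):
    # each entry is a base64-encoded PNG/JPEG produced by the WebUI
    Image.open(io.BytesIO(base64.b64decode(b64))).save(f"controlnet_img2img_{i}.png")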
extensions/microsoftexcel-controlnet/example/api_txt2img.ipynb ADDED
@@ -0,0 +1,104 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# controlnet + txt2img\n",
10
+ "# enable `Allow other script to control this extension` in settings\n",
11
+ "\n",
12
+ "import requests\n",
13
+ "import cv2\n",
14
+ "from base64 import b64encode\n",
15
+ "\n",
16
+ "def readImage(path):\n",
17
+ " img = cv2.imread(path)\n",
18
+ " retval, buffer = cv2.imencode('.jpg', img)\n",
19
+ " b64img = b64encode(buffer).decode(\"utf-8\")\n",
20
+ " return b64img\n",
21
+ "\n",
22
+ "b64img = readImage(\"/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg\")\n",
23
+ "\n",
24
+ "class controlnetRequest():\n",
25
+ " def __init__(self, prompt):\n",
26
+ " self.url = \"http://localhost:7860/controlnet/txt2img\"\n",
27
+ " self.body = {\n",
28
+ " \"prompt\": prompt,\n",
29
+ " \"negative_prompt\": \"\",\n",
30
+ " \"seed\": -1,\n",
31
+ " \"subseed\": -1,\n",
32
+ " \"subseed_strength\": 0,\n",
33
+ " \"batch_size\": 1,\n",
34
+ " \"n_iter\": 1,\n",
35
+ " \"steps\": 15,\n",
36
+ " \"cfg_scale\": 7,\n",
37
+ " \"width\": 512,\n",
38
+ " \"height\": 768,\n",
39
+ " \"restore_faces\": True,\n",
40
+ " \"eta\": 0,\n",
41
+ " \"sampler_index\": \"Euler a\",\n",
42
+ " \"controlnet_input_image\": [b64img],\n",
43
+ " \"controlnet_module\": 'canny',\n",
44
+ " \"controlnet_model\": 'control_canny-fp16 [e3fe7712]',\n",
45
+ " \"controlnet_guidance\": 1.0,\n",
46
+ " }\n",
47
+ "\n",
48
+ " def sendRequest(self):\n",
49
+ " r = requests.post(self.url, json=self.body)\n",
50
+ " return r.json()\n",
51
+ "\n",
52
+ "js = controlnetRequest(\"walter white\").sendRequest()"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "import io, base64\n",
62
+ "import matplotlib.pyplot as plt\n",
63
+ "from PIL import Image\n",
64
+ "\n",
65
+ "pil_img = Image.open('/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg')\n",
66
+ "image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][0])))\n",
67
+ "mask_image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][1])))\n",
68
+ "\n",
69
+ "plt.figure()\n",
70
+ "f, axarr = plt.subplots(1,3) \n",
71
+ "axarr[0].imshow(pil_img) \n",
72
+ "axarr[1].imshow(image) \n",
73
+ "axarr[2].imshow(mask_image) "
74
+ ]
75
+ }
76
+ ],
77
+ "metadata": {
78
+ "kernelspec": {
79
+ "display_name": "pynb",
80
+ "language": "python",
81
+ "name": "python3"
82
+ },
83
+ "language_info": {
84
+ "codemirror_mode": {
85
+ "name": "ipython",
86
+ "version": 3
87
+ },
88
+ "file_extension": ".py",
89
+ "mimetype": "text/x-python",
90
+ "name": "python",
91
+ "nbconvert_exporter": "python",
92
+ "pygments_lexer": "ipython3",
93
+ "version": "3.10.9"
94
+ },
95
+ "orig_nbformat": 4,
96
+ "vscode": {
97
+ "interpreter": {
98
+ "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206"
99
+ }
100
+ }
101
+ },
102
+ "nbformat": 4,
103
+ "nbformat_minor": 2
104
+ }
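Because the request body is plain JSON, the same class can drive a quick parameter sweep; a sketch, with guidance values picked only for illustration:

results = {}
for guidance in (0.5, 1.0, 1.5):
    req = controlnetRequest("walter white")
    req.body["controlnet_guidance"] = guidance  # reuse the body built in the first cell
    results[guidance] = req.sendRequest()       # one txt2img call per guidance value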
extensions/microsoftexcel-controlnet/example/chatgpt.py ADDED
@@ -0,0 +1,676 @@
1
+ import os
2
+ import re
3
+ import uuid
4
+ import cv2
5
+ import torch
6
+ import requests
7
+ import io, base64
8
+ import numpy as np
9
+ import gradio as gr
10
+ from PIL import Image
11
+ from base64 import b64encode
12
+ from omegaconf import OmegaConf
13
+ from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
15
+
16
+ from langchain.agents.initialize import initialize_agent
17
+ from langchain.agents.tools import Tool
18
+ from langchain.chains.conversation.memory import ConversationBufferMemory
19
+ from langchain.llms.openai import OpenAI
20
+
21
+ VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
22
+ Visual ChatGPT is able to process and understand large amounts of text and images. As a language model, Visual ChatGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Visual ChatGPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
23
+ Human may provide new figures to Visual ChatGPT with a description. The description helps Visual ChatGPT to understand this image, but Visual ChatGPT should use tools to finish following tasks, rather than directly imagine from the description.
24
+ Overall, Visual ChatGPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
25
+ TOOLS:
26
+ ------
27
+ Visual ChatGPT has access to the following tools:"""
28
+
29
+ VISUAL_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:
30
+ ```
31
+ Thought: Do I need to use a tool? Yes
32
+ Action: the action to take, should be one of [{tool_names}]
33
+ Action Input: the input to the action
34
+ Observation: the result of the action
35
+ ```
36
+ When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
37
+ ```
38
+ Thought: Do I need to use a tool? No
39
+ {ai_prefix}: [your response here]
40
+ ```
41
+ """
42
+
43
+ VISUAL_CHATGPT_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if it does not exist.
44
+ You will remember to provide the image file name loyally if it's provided in the last tool observation.
45
+ Begin!
46
+ Previous conversation history:
47
+ {chat_history}
48
+ New input: {input}
49
+ Since Visual ChatGPT is a text language model, Visual ChatGPT must use tools to observe images rather than imagination.
50
+ The thoughts and observations are only visible for Visual ChatGPT, Visual ChatGPT should remember to repeat important information in the final response for Human.
51
+ Thought: Do I need to use a tool? {agent_scratchpad}"""
52
+
53
+ ENDPOINT = "http://localhost:7860"
54
+ T2IAPI = ENDPOINT + "/controlnet/txt2img"
55
+ DETECTAPI = ENDPOINT + "/controlnet/detect"
56
+ MODELLIST = ENDPOINT + "/controlnet/model_list"
57
+
58
+ device = "cpu"
59
+ if torch.cuda.is_available():
60
+ device = "cuda"
61
+
62
+ def readImage(path):
63
+ img = cv2.imread(path)
64
+ retval, buffer = cv2.imencode('.jpg', img)
65
+ b64img = b64encode(buffer).decode("utf-8")
66
+ return b64img
67
+
68
+ def get_model(pattern='^control_canny.*'):
69
+ r = requests.get(MODELLIST)
70
+ result = r.json()["model_list"]
71
+ for item in result:
72
+ if re.match(pattern, item):
73
+ return item
74
+
75
+ def do_webui_request(url=T2IAPI, **kwargs):
76
+ reqbody = {
77
+ "prompt": "best quality, extremely detailed",
78
+ "negative_prompt": "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
79
+ "seed": -1,
80
+ "subseed": -1,
81
+ "subseed_strength": 0,
82
+ "batch_size": 1,
83
+ "n_iter": 1,
84
+ "steps": 15,
85
+ "cfg_scale": 7,
86
+ "width": 512,
87
+ "height": 768,
88
+ "restore_faces": True,
89
+ "eta": 0,
90
+ "sampler_index": "Euler a",
91
+ "controlnet_input_images": [],
92
+ "controlnet_module": 'canny',
93
+ "controlnet_model": 'control_canny-fp16 [e3fe7712]',
94
+ "controlnet_guidance": 1.0,
95
+ }
96
+ reqbody.update(kwargs)
97
+ r = requests.post(url, json=reqbody)
98
+ return r.json()
99
+
100
+
101
+ def cut_dialogue_history(history_memory, keep_last_n_words=500):
102
+ tokens = history_memory.split()
103
+ n_tokens = len(tokens)
104
+ print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
105
+ if n_tokens < keep_last_n_words:
106
+ return history_memory
107
+ else:
108
+ paragraphs = history_memory.split('\n')
109
+ last_n_tokens = n_tokens
110
+ while last_n_tokens >= keep_last_n_words:
111
+ last_n_tokens = last_n_tokens - len(paragraphs[0].split(' '))
112
+ paragraphs = paragraphs[1:]
113
+ return '\n' + '\n'.join(paragraphs)
114
+
115
+ def get_new_image_name(org_img_name, func_name="update"):
116
+ head_tail = os.path.split(org_img_name)
117
+ head = head_tail[0]
118
+ tail = head_tail[1]
119
+ name_split = tail.split('.')[0].split('_')
120
+ this_new_uuid = str(uuid.uuid4())[0:4]
121
+ if len(name_split) == 1:
122
+ most_org_file_name = name_split[0]
123
+ recent_prev_file_name = name_split[0]
124
+ new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name)
125
+ else:
126
+ assert len(name_split) == 4
127
+ most_org_file_name = name_split[3]
128
+ recent_prev_file_name = name_split[0]
129
+ new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name)
130
+ return os.path.join(head, new_file_name)
131
+
132
+ class MaskFormer:
133
+ def __init__(self, device):
134
+ self.device = device
135
+ self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
136
+ self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
137
+
138
+ def inference(self, image_path, text):
139
+ threshold = 0.5
140
+ min_area = 0.02
141
+ padding = 20
142
+ original_image = Image.open(image_path)
143
+ image = original_image.resize((512, 512))
144
+ inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt",).to(self.device)
145
+ with torch.no_grad():
146
+ outputs = self.model(**inputs)
147
+ mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold
148
+ area_ratio = len(np.argwhere(mask)) / (mask.shape[0] * mask.shape[1])
149
+ if area_ratio < min_area:
150
+ return None
151
+ true_indices = np.argwhere(mask)
152
+ mask_array = np.zeros_like(mask, dtype=bool)
153
+ for idx in true_indices:
154
+ padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx)
155
+ mask_array[padded_slice] = True
156
+ visual_mask = (mask_array * 255).astype(np.uint8)
157
+ image_mask = Image.fromarray(visual_mask)
158
+ return image_mask.resize(image.size)
159
+
160
+ # class ImageEditing:
161
+ # def __init__(self, device):
162
+ # print("Initializing StableDiffusionInpaint to %s" % device)
163
+ # self.device = device
164
+ # self.mask_former = MaskFormer(device=self.device)
165
+ # # self.inpainting = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting",).to(device)
166
+
167
+ # def remove_part_of_image(self, input):
168
+ # image_path, to_be_removed_txt = input.split(",")
169
+ # print(f'remove_part_of_image: to_be_removed {to_be_removed_txt}')
170
+ # return self.replace_part_of_image(f"{image_path},{to_be_removed_txt},background")
171
+
172
+ # def replace_part_of_image(self, input):
173
+ # image_path, to_be_replaced_txt, replace_with_txt = input.split(",")
174
+ # print(f'replace_part_of_image: replace_with_txt {replace_with_txt}')
175
+ # mask_image = self.mask_former.inference(image_path, to_be_replaced_txt)
176
+ # buffered = io.BytesIO()
177
+ # mask_image.save(buffered, format="JPEG")
178
+ # resp = do_webui_request(
179
+ # url=ENDPOINT + "/sdapi/v1/img2img",
180
+ # init_images=[readImage(image_path)],
181
+ # mask=b64encode(buffered.getvalue()).decode("utf-8"),
182
+ # prompt=replace_with_txt,
183
+ # )
184
+ # image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
185
+ # updated_image_path = get_new_image_name(image_path, func_name="replace-something")
186
+ # updated_image.save(updated_image_path)
187
+ # return updated_image_path
188
+
189
+ # class Pix2Pix:
190
+ # def __init__(self, device):
191
+ # print("Initializing Pix2Pix to %s" % device)
192
+ # self.device = device
193
+ # self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", torch_dtype=torch.float16, safety_checker=None).to(device)
194
+ # self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
195
+
196
+ # def inference(self, inputs):
197
+ # """Change style of image."""
198
+ # print("===>Starting Pix2Pix Inference")
199
+ # image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
200
+ # original_image = Image.open(image_path)
201
+ # image = self.pipe(instruct_text,image=original_image,num_inference_steps=40,image_guidance_scale=1.2,).images[0]
202
+ # updated_image_path = get_new_image_name(image_path, func_name="pix2pix")
203
+ # image.save(updated_image_path)
204
+ # return updated_image_path
205
+
206
+
207
+ class T2I:
208
+ def __init__(self, device):
209
+ print("Initializing T2I to %s" % device)
210
+ self.device = device
211
+ self.text_refine_tokenizer = AutoTokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
212
+ self.text_refine_model = AutoModelForCausalLM.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
213
+ self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model, tokenizer=self.text_refine_tokenizer, device=self.device)
214
+
215
+ def inference(self, text):
216
+ image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
217
+ refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"]
218
+ print(f'{text} refined to {refined_text}')
219
+ resp = do_webui_request(
220
+ url=ENDPOINT + "/sdapi/v1/txt2img",
221
+ prompt=refined_text,
222
+ )
223
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
224
+ image.save(image_filename)
225
+ print(f"Processed T2I.run, text: {text}, image_filename: {image_filename}")
226
+ return image_filename
227
+
228
+
229
+ class ImageCaptioning:
230
+ def __init__(self, device):
231
+ print("Initializing ImageCaptioning to %s" % device)
232
+ self.device = device
233
+ self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
234
+ self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device)
235
+
236
+ def inference(self, image_path):
237
+ inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device)
238
+ out = self.model.generate(**inputs)
239
+ captions = self.processor.decode(out[0], skip_special_tokens=True)
240
+ return captions
241
+
242
+
243
+ class image2canny:
244
+ def inference(self, inputs):
245
+ print("===>Starting image2canny Inference")
246
+ resp = do_webui_request(
247
+ url=DETECTAPI,
248
+ controlnet_input_images=[readImage(inputs)],
249
+ controlnet_module="canny",
250
+ )
251
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
+ updated_image_path = get_new_image_name(inputs, func_name="edge")
252
+ image.save(updated_image_path)
253
+ return updated_image_path
254
+
255
+
256
+ class canny2image:
257
+ def inference(self, inputs):
258
+ print("===>Starting canny2image Inference")
259
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
260
+ resp = do_webui_request(
261
+ prompt=instruct_text,
262
+ controlnet_input_images=[readImage(image_path)],
263
+ controlnet_module="none",
264
+ controlnet_model=get_model(pattern='^control_canny.*'),
265
+ )
266
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
267
+ updated_image_path = get_new_image_name(image_path, func_name="canny2image")
268
+ real_image = image # use the decoded image from the API response
269
+ real_image.save(updated_image_path)
270
+ return updated_image_path
271
+
272
+
273
+ class image2line:
274
+ def inference(self, inputs):
275
+ print("===>Starting image2hough Inference")
276
+ resp = do_webui_request(
277
+ url=DETECTAPI,
278
+ controlnet_input_images=[readImage(inputs)],
279
+ controlnet_module="mlsd",
280
+ )
281
+ updated_image_path = get_new_image_name(inputs, func_name="line-of")
282
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
283
+ image.save(updated_image_path)
284
+ return updated_image_path
285
+
286
+
287
+ class line2image:
288
+ def inference(self, inputs):
289
+ print("===>Starting line2image Inference")
290
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
291
+ resp = do_webui_request(
292
+ prompt=instruct_text,
293
+ controlnet_input_images=[readImage(image_path)],
294
+ controlnet_module="none",
295
+ controlnet_model=get_model(pattern='^control_mlsd.*'),
296
+ )
297
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
298
+ updated_image_path = get_new_image_name(image_path, func_name="line2image")
299
+ real_image = image # use the decoded index-0 image from the API response
300
+ real_image.save(updated_image_path)
301
+ return updated_image_path
302
+
303
+
304
+ class image2hed:
305
+ def inference(self, inputs):
306
+ print("===>Starting image2hed Inference")
307
+ resp = do_webui_request(
308
+ url=DETECTAPI,
309
+ controlnet_input_images=[readImage(inputs)],
310
+ controlnet_module="hed",
311
+ )
312
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
313
+ updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
314
+ image.save(updated_image_path)
315
+ return updated_image_path
316
+
317
+
318
+ class hed2image:
319
+ def inference(self, inputs):
320
+ print("===>Starting hed2image Inference")
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
321
+ resp = do_webui_request(
322
+ prompt=instruct_text,
323
+ controlnet_input_images=[readImage(image_path)],
324
+ controlnet_module="none",
325
+ controlnet_model=get_model(pattern='^control_hed.*'),
326
+ )
327
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
328
+ updated_image_path = get_new_image_name(image_path, func_name="hed2image")
329
+ real_image = image # use the decoded index-0 image from the API response
330
+ real_image.save(updated_image_path)
331
+ return updated_image_path
332
+
333
+
334
+ class image2scribble:
335
+ def inference(self, inputs):
336
+ print("===>Starting image2scribble Inference")
337
+ resp = do_webui_request(
338
+ url=DETECTAPI,
339
+ controlnet_input_images=[readImage(inputs)],
340
+ controlnet_module="scribble",
341
+ )
342
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
343
+ updated_image_path = get_new_image_name(inputs, func_name="scribble")
344
+ image.save(updated_image_path)
345
+ return updated_image_path
346
+
347
+
348
+ class scribble2image:
349
+ def inference(self, inputs):
350
+ print("===>Starting scribble2image Inference")
351
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
352
+ resp = do_webui_request(
353
+ prompt=instruct_text,
354
+ controlnet_input_images=[readImage(image_path)],
355
+ controlnet_module="none",
356
+ controlnet_model=get_model(pattern='^control_scribble.*'),
357
+ )
358
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
359
+ updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
360
+ real_image = image # use the decoded image from the API response
361
+ real_image.save(updated_image_path)
362
+ return updated_image_path
363
+
364
+
365
+ class image2pose:
366
+ def inference(self, inputs):
367
+ print("===>Starting image2pose Inference")
368
+ resp = do_webui_request(
369
+ url=DETECTAPI,
370
+ controlnet_input_images=[readImage(inputs)],
371
+ controlnet_module="openpose",
372
+ )
373
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
374
+ updated_image_path = get_new_image_name(inputs, func_name="human-pose")
375
+ image.save(updated_image_path)
376
+ return updated_image_path
377
+
378
+
379
+ class pose2image:
380
+ def inference(self, inputs):
381
+ print("===>Starting pose2image Inference")
382
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
383
+ resp = do_webui_request(
384
+ prompt=instruct_text,
385
+ controlnet_input_images=[readImage(image_path)],
386
+ controlnet_module="none",
387
+ controlnet_model=get_model(pattern='^control_openpose.*'),
388
+ )
389
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
390
+ updated_image_path = get_new_image_name(image_path, func_name="pose2image")
391
+ real_image = image # use the decoded index-0 image from the API response
392
+ real_image.save(updated_image_path)
393
+ return updated_image_path
394
+
395
+
396
+ class image2seg:
397
+ def inference(self, inputs):
398
+ print("===>Starting image2seg Inference")
399
+ resp = do_webui_request(
400
+ url=DETECTAPI,
401
+ controlnet_input_images=[readImage(inputs)],
402
+ controlnet_module="segmentation",
403
+ )
404
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
405
+ updated_image_path = get_new_image_name(inputs, func_name="segmentation")
406
+ image.save(updated_image_path)
407
+ return updated_image_path
408
+
409
+
410
+ class seg2image:
411
+ def inference(self, inputs):
412
+ print("===>Starting seg2image Inference")
413
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
414
+ resp = do_webui_request(
415
+ prompt=instruct_text,
416
+ controlnet_input_images=[readImage(image_path)],
417
+ controlnet_module="none",
418
+ controlnet_model=get_model(pattern='^control_seg.*'),
419
+ )
420
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
421
+ updated_image_path = get_new_image_name(image_path, func_name="segment2image")
422
+ real_image = image # use the decoded image from the API response
423
+ real_image.save(updated_image_path)
424
+ return updated_image_path
425
+
426
+
427
+ class image2depth:
428
+ def inference(self, inputs):
429
+ print("===>Starting image2depth Inference")
430
+ resp = do_webui_request(
431
+ url=DETECTAPI,
432
+ controlnet_input_images=[readImage(inputs)],
433
+ controlnet_module="depth",
434
+ )
435
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
436
+ updated_image_path = get_new_image_name(inputs, func_name="depth")
437
+ image.save(updated_image_path)
438
+ return updated_image_path
439
+
440
+
441
+ class depth2image:
442
+ def inference(self, inputs):
443
+ print("===>Starting depth2image Inference")
444
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
445
+ resp = do_webui_request(
446
+ prompt=instruct_text,
447
+ controlnet_input_images=[readImage(image_path)],
448
+ controlnet_module="depth",
449
+ controlnet_model=get_model(pattern='^control_depth.*'),
450
+ )
451
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
452
+ updated_image_path = get_new_image_name(image_path, func_name="depth2image")
453
+ real_image = image # use the decoded index-0 image from the API response
454
+ real_image.save(updated_image_path)
455
+ return updated_image_path
456
+
457
+
458
+ class image2normal:
459
+ def inference(self, inputs):
460
+ print("===>Starting image2normal Inference")
461
+ resp = do_webui_request(
462
+ url=DETECTAPI,
463
+ controlnet_input_images=[readImage(inputs)],
464
+ controlnet_module="normal",
465
+ )
466
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
467
+ updated_image_path = get_new_image_name(inputs, func_name="normal-map")
468
+ image.save(updated_image_path)
469
+ return updated_image_path
470
+
471
+
472
+ class normal2image:
473
+ def inference(self, inputs):
474
+ print("===>Starting normal2image Inference")
475
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
476
+ resp = do_webui_request(
477
+ prompt=instruct_text,
478
+ controlnet_input_images=[readImage(image_path)],
479
+ controlnet_module="normal",
480
+ controlnet_model=get_model(pattern='^control_normal.*'),
481
+ )
482
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
483
+ updated_image_path = get_new_image_name(image_path, func_name="normal2image")
484
+ real_image = image # use the decoded index-0 image from the API response
485
+ real_image.save(updated_image_path)
486
+ return updated_image_path
487
+
488
+
489
+ class BLIPVQA:
490
+ def __init__(self, device):
491
+ print("Initializing BLIP VQA to %s" % device)
492
+ self.device = device
493
+ self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
494
+ self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(self.device)
495
+
496
+ def get_answer_from_question_and_image(self, inputs):
497
+ image_path, question = inputs.split(",")
498
+ raw_image = Image.open(image_path).convert('RGB')
499
+ print(F'BLIPVQA :question :{question}')
500
+ inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device)
501
+ out = self.model.generate(**inputs)
502
+ answer = self.processor.decode(out[0], skip_special_tokens=True)
503
+ return answer
504
+
505
+
506
+ class ConversationBot:
507
+ def __init__(self):
508
+ print("Initializing VisualChatGPT")
509
+ # self.edit = ImageEditing(device=device)
510
+ self.i2t = ImageCaptioning(device=device)
511
+ self.t2i = T2I(device=device)
512
+ self.image2canny = image2canny()
513
+ self.canny2image = canny2image()
514
+ self.image2line = image2line()
515
+ self.line2image = line2image()
516
+ self.image2hed = image2hed()
517
+ self.hed2image = hed2image()
518
+ self.image2scribble = image2scribble()
519
+ self.scribble2image = scribble2image()
520
+ self.image2pose = image2pose()
521
+ self.pose2image = pose2image()
522
+ self.BLIPVQA = BLIPVQA(device=device)
523
+ self.image2seg = image2seg()
524
+ self.seg2image = seg2image()
525
+ self.image2depth = image2depth()
526
+ self.depth2image = depth2image()
527
+ self.image2normal = image2normal()
528
+ self.normal2image = normal2image()
529
+ # self.pix2pix = Pix2Pix(device="cuda:3")
530
+ self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
531
+ self.tools = [
532
+ Tool(name="Get Photo Description", func=self.i2t.inference,
533
+ description="useful when you want to know what is inside the photo. receives image_path as input. "
534
+ "The input to this tool should be a string, representing the image_path. "),
535
+ Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
536
+ description="useful when you want to generate an image from a user input text and save it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
537
+ "The input to this tool should be a string, representing the text used to generate image. "),
538
+ # Tool(name="Remove Something From The Photo", func=self.edit.remove_part_of_image,
539
+ # description="useful when you want to remove and object or something from the photo from its description or location. "
540
+ # "The input to this tool should be a comma seperated string of two, representing the image_path and the object need to be removed. "),
541
+ # Tool(name="Replace Something From The Photo", func=self.edit.replace_part_of_image,
542
+ # description="useful when you want to replace an object from the object description or location with another object from its description. "
543
+ # "The input to this tool should be a comma seperated string of three, representing the image_path, the object to be replaced, the object to be replaced with "),
544
+
545
+ # Tool(name="Instruct Image Using Text", func=self.pix2pix.inference,
546
+ # description="useful when you want to the style of the image to be like the text. like: make it look like a painting. or make it like a robot. "
547
+ # "The input to this tool should be a comma seperated string of two, representing the image_path and the text. "),
548
+ Tool(name="Answer Question About The Image", func=self.BLIPVQA.get_answer_from_question_and_image,
549
+ description="useful when you need an answer for a question based on an image. like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
550
+ "The input to this tool should be a comma separated string of two, representing the image_path and the question"),
551
+ Tool(name="Edge Detection On Image", func=self.image2canny.inference,
552
+ description="useful when you want to detect the edge of the image. like: detect the edges of this image, or canny detection on image, or perform edge detection on this image, or detect the canny image of this image. "
553
+ "The input to this tool should be a string, representing the image_path"),
554
+ Tool(name="Generate Image Condition On Canny Image", func=self.canny2image.inference,
555
+ description="useful when you want to generate a new real image from both the user description and a canny image. like: generate a real image of an object or something from this canny image, or generate a new real image of an object or something from this edge image. "
556
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description. "),
557
+ Tool(name="Line Detection On Image", func=self.image2line.inference,
558
+ description="useful when you want to detect the straight lines of the image. like: detect the straight lines of this image, or straight line detection on image, or perform straight line detection on this image, or detect the straight line image of this image. "
559
+ "The input to this tool should be a string, representing the image_path"),
560
+ Tool(name="Generate Image Condition On Line Image", func=self.line2image.inference,
561
+ description="useful when you want to generate a new real image from both the user description and a straight line image. like: generate a real image of an object or something from this straight line image, or generate a new real image of an object or something from these straight lines. "
562
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description. "),
563
+ Tool(name="Hed Detection On Image", func=self.image2hed.inference,
564
+ description="useful when you want to detect the soft hed boundary of the image. like: detect the soft hed boundary of this image, or hed boundary detection on image, or perform hed boundary detection on this image, or detect soft hed boundary image of this image. "
565
+ "The input to this tool should be a string, representing the image_path"),
566
+ Tool(name="Generate Image Condition On Soft Hed Boundary Image", func=self.hed2image.inference,
567
+ description="useful when you want to generate a new real image from both the user description and a soft hed boundary image. like: generate a real image of an object or something from this soft hed boundary image, or generate a new real image of an object or something from this hed boundary. "
568
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
569
+ Tool(name="Segmentation On Image", func=self.image2seg.inference,
570
+ description="useful when you want to detect segmentations of the image. like: segment this image, or generate segmentations on this image, or perform segmentation on this image. "
571
+ "The input to this tool should be a string, representing the image_path"),
572
+ Tool(name="Generate Image Condition On Segmentations", func=self.seg2image.inference,
573
+ description="useful when you want to generate a new real image from both the user description and segmentations. like: generate a real image of an object or something from this segmentation image, or generate a new real image of an object or something from these segmentations. "
574
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
575
+ Tool(name="Predict Depth On Image", func=self.image2depth.inference,
576
+ description="useful when you want to detect depth of the image. like: generate the depth from this image, or detect the depth map on this image, or predict the depth for this image. "
577
+ "The input to this tool should be a string, representing the image_path"),
578
+ Tool(name="Generate Image Condition On Depth", func=self.depth2image.inference,
579
+ description="useful when you want to generate a new real image from both the user description and a depth image. like: generate a real image of an object or something from this depth image, or generate a new real image of an object or something from the depth map. "
580
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
581
+ Tool(name="Predict Normal Map On Image", func=self.image2normal.inference,
582
+ description="useful when you want to detect norm map of the image. like: generate normal map from this image, or predict normal map of this image. "
583
+ "The input to this tool should be a string, representing the image_path"),
584
+ Tool(name="Generate Image Condition On Normal Map", func=self.normal2image.inference,
585
+ description="useful when you want to generate a new real image from both the user description and a normal map. like: generate a real image of an object or something from this normal map, or generate a new real image of an object or something from the normal map. "
586
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
587
+ Tool(name="Sketch Detection On Image", func=self.image2scribble.inference,
588
+ description="useful when you want to generate a scribble of the image. like: generate a scribble of this image, or generate a sketch from this image, detect the sketch from this image. "
589
+ "The input to this tool should be a string, representing the image_path"),
590
+ Tool(name="Generate Image Condition On Sketch Image", func=self.scribble2image.inference,
591
+ description="useful when you want to generate a new real image from both the user description and a scribble image or a sketch image. "
592
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
593
+ Tool(name="Pose Detection On Image", func=self.image2pose.inference,
594
+ description="useful when you want to detect the human pose of the image. like: generate human poses of this image, or generate a pose image from this image. "
595
+ "The input to this tool should be a string, representing the image_path"),
596
+ Tool(name="Generate Image Condition On Pose Image", func=self.pose2image.inference,
597
+ description="useful when you want to generate a new real image from both the user description and a human pose image. like: generate a real image of a human from this human pose image, or generate a new real image of a human from this pose. "
598
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description")]
599
+
600
+ def init_langchain(self, openai_api_key):
601
+ self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
602
+ self.agent = initialize_agent(
603
+ self.tools,
604
+ self.llm,
605
+ agent="conversational-react-description",
606
+ verbose=True,
607
+ memory=self.memory,
608
+ return_intermediate_steps=True,
609
+ agent_kwargs={'prefix': VISUAL_CHATGPT_PREFIX, 'format_instructions': VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': VISUAL_CHATGPT_SUFFIX}
610
+ )
611
+
612
+ def run_text(self, openai_api_key, text, state):
613
+ if not hasattr(self, "agent"):
614
+ self.init_langchain(openai_api_key)
615
+ print("===============Running run_text =============")
616
+ print("Inputs:", text, state)
617
+ print("======>Previous memory:\n %s" % self.agent.memory)
618
+ self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
619
+ res = self.agent({"input": text})
620
+ print("======>Current memory:\n %s" % self.agent.memory)
621
+ response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
622
+ state = state + [(text, response)]
623
+ print("Outputs:", state)
624
+ return state, state
625
+
626
+ def run_image(self, openai_api_key, image, state, txt):
627
+ if not hasattr(self, "agent"):
628
+ self.init_langchain(openai_api_key)
629
+ print("===============Running run_image =============")
630
+ print("Inputs:", image, state)
631
+ print("======>Previous memory:\n %s" % self.agent.memory)
632
+ image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
633
+ print("======>Auto Resize Image...")
634
+ img = Image.open(image.name)
635
+ width, height = img.size
636
+ ratio = min(512 / width, 512 / height)
637
+ width_new, height_new = (round(width * ratio), round(height * ratio))
638
+ img = img.resize((width_new, height_new))
639
+ img = img.convert('RGB')
640
+ img.save(image_filename, "PNG")
641
+ print(f"Resize image from {width}x{height} to {width_new}x{height_new}")
642
+ description = self.i2t.inference(image_filename)
643
+ Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \
644
+ "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description)
645
+ AI_prompt = "Received. "
646
+ self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
647
+ print("======>Current memory:\n %s" % self.agent.memory)
648
+ state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
649
+ print("Outputs:", state)
650
+ return state, state, txt + ' ' + image_filename + ' '
651
+
652
+
653
+ if __name__ == '__main__':
654
+ os.makedirs("image/", exist_ok=True)
655
+ bot = ConversationBot()
656
+ with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
657
+ openai_api_key = gr.Textbox(type="password", label="Enter your OpenAI API key here")
658
+ chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT")
659
+ state = gr.State([])
660
+ with gr.Row():
661
+ with gr.Column(scale=0.7):
662
+ txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
663
+ with gr.Column(scale=0.15, min_width=0):
664
+ clear = gr.Button("Clear️")
665
+ with gr.Column(scale=0.15, min_width=0):
666
+ btn = gr.UploadButton("Upload", file_types=["image"])
667
+
668
+ txt.submit(bot.run_text, [openai_api_key, txt, state], [chatbot, state])
669
+ txt.submit(lambda: "", None, txt)
670
+ btn.upload(bot.run_image, [openai_api_key, btn, state, txt], [chatbot, state, txt])
671
+ clear.click(bot.memory.clear)
672
+ clear.click(lambda: [], None, chatbot)
673
+ clear.click(lambda: [], None, state)
674
+
675
+
676
+ demo.launch(server_name="0.0.0.0", server_port=7864)
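The tool classes can also be exercised without the Gradio UI, as long as the WebUI API is reachable at ENDPOINT; a minimal sketch, with a placeholder image path:

bot = ConversationBot()                                     # loads BLIP, CLIPSeg and MagicPrompt models
caption = bot.i2t.inference("image/example.png")            # BLIP caption of the input image
edge_path = bot.image2canny.inference("image/example.png")  # canny map via the /controlnet/detect API
result_path = bot.canny2image.inference(f"{edge_path}, {caption}")
print(caption, result_path)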
extensions/microsoftexcel-controlnet/example/visual_chatgpt.ipynb ADDED
@@ -0,0 +1,60 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Run WebUI in API mode\n",
10
+ "nohup python launch.py --api --xformers &\n",
11
+ "\n",
12
+ "# Wait until webui fully startup\n",
13
+ "tail -f nohup.out"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "# Install/Upgrade transformers\n",
23
+ "pip install -U transformers\n",
24
+ "\n",
25
+ "# Install deps\n",
26
+ "pip install langchain==0.0.101 openai \n",
27
+ "\n",
28
+ "# Run exmaple\n",
29
+ "python example/chatgpt.py"
30
+ ]
31
+ }
32
+ ],
33
+ "metadata": {
34
+ "kernelspec": {
35
+ "display_name": "pynb",
36
+ "language": "python",
37
+ "name": "python3"
38
+ },
39
+ "language_info": {
40
+ "codemirror_mode": {
41
+ "name": "ipython",
42
+ "version": 3
43
+ },
44
+ "file_extension": ".py",
45
+ "mimetype": "text/x-python",
46
+ "name": "python",
47
+ "nbconvert_exporter": "python",
48
+ "pygments_lexer": "ipython3",
49
+ "version": "3.10.9"
50
+ },
51
+ "orig_nbformat": 4,
52
+ "vscode": {
53
+ "interpreter": {
54
+ "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206"
55
+ }
56
+ }
57
+ },
58
+ "nbformat": 4,
59
+ "nbformat_minor": 2
60
+ }
extensions/microsoftexcel-controlnet/extract_controlnet.py ADDED
@@ -0,0 +1,27 @@
1
+ import argparse
2
+ import torch
3
+ from safetensors.torch import load_file, save_file
4
+
5
+ if __name__ == "__main__":
6
+ parser = argparse.ArgumentParser()
7
+ parser.add_argument("--src", default=None, type=str, required=True, help="Path to the model to convert.")
8
+ parser.add_argument("--dst", default=None, type=str, required=True, help="Path to the output model.")
9
+ parser.add_argument("--half", action="store_true", help="Cast to FP16.")
10
+ args = parser.parse_args()
11
+
12
+ assert args.src is not None, "Must provide a model path!"
13
+ assert args.dst is not None, "Must provide a checkpoint path!"
14
+
15
+ if args.src.endswith(".safetensors"):
16
+ state_dict = load_file(args.src)
17
+ else:
18
+ state_dict = torch.load(args.src)
19
+
20
+ if any([k.startswith("control_model.") for k, v in state_dict.items()]):
21
+ dtype = torch.float16 if args.half else torch.float32
22
+ state_dict = {k.replace("control_model.", ""): v.to(dtype) for k, v in state_dict.items() if k.startswith("control_model.")}
23
+
24
+ if args.dst.endswith(".safetensors"):
25
+ save_file(state_dict, args.dst)
26
+ else:
27
+ torch.save({"state_dict": state_dict}, args.dst)
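Typical invocation, with placeholder paths: `python extract_controlnet.py --src models/control_sd15_canny.pth --dst models/control_canny-fp16.safetensors --half`. The script keeps only the keys prefixed with `control_model.`, strips that prefix, and optionally casts the tensors to FP16 before saving.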
extensions/microsoftexcel-controlnet/extract_controlnet_diff.py ADDED
@@ -0,0 +1,91 @@
1
+ import argparse
2
+ import torch
3
+ from safetensors.torch import load_file, save_file
4
+
5
+ if __name__ == "__main__":
6
+ parser = argparse.ArgumentParser()
7
+ parser.add_argument("--sd15", default=None, type=str, required=True, help="Path to the original sd15.")
8
+ parser.add_argument("--control", default=None, type=str, required=True, help="Path to the sd15 with control.")
9
+ parser.add_argument("--dst", default=None, type=str, required=True, help="Path to the output difference model.")
10
+ parser.add_argument("--fp16", action="store_true", help="Save as fp16.")
11
+ parser.add_argument("--bf16", action="store_true", help="Save as bf16.")
12
+ args = parser.parse_args()
13
+
14
+ assert args.sd15 is not None, "Must provide a original sd15 model path!"
15
+ assert args.control is not None, "Must provide a sd15 with control model path!"
16
+ assert args.dst is not None, "Must provide a output path!"
17
+
18
+ # make differences: copy from https://github.com/lllyasviel/ControlNet/blob/main/tool_transfer_control.py
19
+
20
+ def get_node_name(name, parent_name):
21
+ if len(name) <= len(parent_name):
22
+ return False, ''
23
+ p = name[:len(parent_name)]
24
+ if p != parent_name:
25
+ return False, ''
26
+ return True, name[len(parent_name):]
27
+
28
+ # remove first/cond stage from sd to reduce memory usage
29
+ def remove_first_and_cond(sd):
30
+ keys = list(sd.keys())
31
+ for key in keys:
32
+ is_first_stage, _ = get_node_name(key, 'first_stage_model')
33
+ is_cond_stage, _ = get_node_name(key, 'cond_stage_model')
34
+ if is_first_stage or is_cond_stage:
35
+ sd.pop(key, None)
36
+ return sd
37
+
38
+ print(f"loading: {args.sd15}")
39
+ if args.sd15.endswith(".safetensors"):
40
+ sd15_state_dict = load_file(args.sd15)
41
+ else:
42
+ sd15_state_dict = torch.load(args.sd15)
43
+ sd15_state_dict = sd15_state_dict.pop("state_dict", sd15_state_dict)
44
+ sd15_state_dict = remove_first_and_cond(sd15_state_dict)
45
+
46
+ print(f"loading: {args.control}")
47
+ if args.control.endswith(".safetensors"):
48
+ control_state_dict = load_file(args.control)
49
+ else:
50
+ control_state_dict = torch.load(args.control)
51
+ control_state_dict = remove_first_and_cond(control_state_dict)
52
+
53
+ # make diff of original and control
54
+ print(f"create difference")
55
+ keys = list(control_state_dict.keys())
56
+ final_state_dict = {"difference": torch.tensor(1.0)} # indicates difference
57
+ for key in keys:
58
+ p = control_state_dict.pop(key)
59
+
60
+ is_control, node_name = get_node_name(key, 'control_')
61
+ if not is_control:
62
+ continue
63
+
64
+ sd15_key_name = 'model.diffusion_' + node_name
65
+ if sd15_key_name in sd15_state_dict: # part of U-Net
66
+ # print("in sd15", key, sd15_key_name)
67
+ p_new = p - sd15_state_dict.pop(sd15_key_name)
68
+ if torch.max(torch.abs(p_new)) < 1e-6: # no difference?
69
+ print("no diff", key, sd15_key_name)
70
+ continue
71
+ else:
72
+ # print("not in sd15", key, sd15_key_name)
73
+ p_new = p # hint or zero_conv
74
+
75
+ final_state_dict[key] = p_new
76
+
77
+ save_dtype = None
78
+ if args.fp16:
79
+ save_dtype = torch.float16
80
+ elif args.bf16:
81
+ save_dtype = torch.bfloat16
82
+ if save_dtype is not None:
83
+ for key in final_state_dict.keys():
84
+ final_state_dict[key] = final_state_dict[key].to(save_dtype)
85
+
86
+ print("saving difference.")
87
+ if args.dst.endswith(".safetensors"):
88
+ save_file(final_state_dict, args.dst)
89
+ else:
90
+ torch.save({"state_dict": final_state_dict}, args.dst)
91
+ print("done!")
extensions/microsoftexcel-controlnet/install.py ADDED
@@ -0,0 +1,20 @@
+ import launch
+ import os
+ import pkg_resources
+
+ req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt")
+
+ with open(req_file) as file:
+     for package in file:
+         try:
+             package = package.strip()
+             if '==' in package:
+                 package_name, package_version = package.split('==')
+                 installed_version = pkg_resources.get_distribution(package_name).version
+                 if installed_version != package_version:
+                     launch.run_pip(f"install {package}", f"sd-webui-controlnet requirement: changing {package_name} version from {installed_version} to {package_version}")
+             elif not launch.is_installed(package):
+                 launch.run_pip(f"install {package}", f"sd-webui-controlnet requirement: {package}")
+         except Exception as e:
+             print(e)
+             print(f'Warning: Failed to install {package}, some preprocessors may not work.')
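install.py leans on the webui's `launch` helpers to install anything from requirements.txt that is missing or pinned to a different version. Roughly the same check can be reproduced outside the webui with pkg_resources and pip; this is only an illustrative sketch, not code from the extension:

import subprocess
import sys
import pkg_resources

def ensure(package: str) -> None:
    """Install `package` (optionally pinned as name==version) if it is absent or mismatched."""
    name, _, wanted = package.partition("==")
    try:
        installed = pkg_resources.get_distribution(name).version
        if wanted and installed != wanted:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    except pkg_resources.DistributionNotFound:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

ensure("mediapipe==0.10.0")  # example pin; any requirements.txt entry is handled the same way
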
extensions/microsoftexcel-controlnet/javascript/hints.js ADDED
@@ -0,0 +1,17 @@
+ onUiUpdate(function () {
+     // mouseover tooltips for various UI elements
+     const titles = {
+         '🔄': 'Refresh',
+         '\u2934': 'Send dimensions to stable diffusion',
+         '💥': 'Run preprocessor',
+         '📝': 'Open new canvas',
+         '📷': 'Enable webcam',
+         '⇄': 'Mirror webcam',
+     };
+     gradioApp().querySelectorAll('.cnet-toolbutton').forEach(function (button) {
+         const tooltip = titles[button.textContent];
+         if (tooltip) {
+             button.title = tooltip;
+         }
+     })
+ });
extensions/microsoftexcel-controlnet/models/cldm_v15.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
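These model YAMLs are plain `target`/`params` trees: every node names a class by its dotted import path and the keyword arguments used to construct it. A generic loader, sketched here with PyYAML and importlib to mirror how such configs are usually consumed (the loader actually used by the webui/extension may differ), resolves the `target` and instantiates it with `params`:

import importlib
import yaml

def instantiate(node):
    """Build an object from a {target: ..., params: {...}} config node."""
    module_name, cls_name = node["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**node.get("params", {}))

with open("models/cldm_v15.yaml") as f:
    config = yaml.safe_load(f)

model = instantiate(config["model"])  # assumes the cldm/ldm packages are importable

The control_sd15_*.yaml and control_v11*_sd15_*.yaml files that follow use the same schema; apart from cldm_v21.yaml (OpenCLIP text encoder, context_dim 1024, num_head_channels 64) and the shuffle config (which adds global_average_pooling: True), they are content-identical to cldm_v15.yaml.
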
extensions/microsoftexcel-controlnet/models/cldm_v21.yaml ADDED
@@ -0,0 +1,85 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
extensions/microsoftexcel-controlnet/models/control_sd15_canny.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_depth.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_hed.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_mlsd.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_normal.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_openpose.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_scribble.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_seg.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_v11e_sd15_ip2p.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_v11e_sd15_shuffle.yaml ADDED
@@ -0,0 +1,80 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+     global_average_pooling: True
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_v11f1e_sd15_tile.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f31868eedb243a77932e3c63907a6ba0a2058b6d65b5c27b89ee1b7f618ea33
+ size 722601104
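The two .safetensors entries in this commit are Git LFS pointer files rather than the weights themselves: they carry only the LFS spec version, the sha256 of the real blob, and its size in bytes. Once the actual file has been downloaded, it can be checked against the pointer with a small helper along these lines (illustrative only):

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream a file and return its hex sha256 digest."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "2f31868eedb243a77932e3c63907a6ba0a2058b6d65b5c27b89ee1b7f618ea33"  # oid from the pointer above
assert sha256_of("models/control_v11f1e_sd15_tile.safetensors") == expected
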
extensions/microsoftexcel-controlnet/models/control_v11f1e_sd15_tile.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_v11f1p_sd15_depth.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6bab8043519c0f563853459c1e4f4e93445a87cef1dcdfa3e1e70115b3c83553
+ size 722601100
extensions/microsoftexcel-controlnet/models/control_v11f1p_sd15_depth.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder