toto10 committed
Commit ee4ceae
1 Parent(s): 47be066

27e42d28c71811068f519a9ffbc46cb633703ba3422ea495312037611c462237

Files changed (33)
  1. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/__init__.py +31 -0
  2. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json +67 -0
  3. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py +333 -0
  4. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/__init__.py +24 -0
  5. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/arg_utils.py +33 -0
  6. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/config.py +437 -0
  7. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/easydict/__init__.py +158 -0
  8. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/geometry.py +98 -0
  9. extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/misc.py +368 -0
  10. extensions/microsoftexcel-controlnet/example/api_img2img.ipynb +105 -0
  11. extensions/microsoftexcel-controlnet/example/api_txt2img.ipynb +104 -0
  12. extensions/microsoftexcel-controlnet/example/chatgpt.py +676 -0
  13. extensions/microsoftexcel-controlnet/example/visual_chatgpt.ipynb +60 -0
  14. extensions/microsoftexcel-controlnet/extract_controlnet.py +27 -0
  15. extensions/microsoftexcel-controlnet/extract_controlnet_diff.py +91 -0
  16. extensions/microsoftexcel-controlnet/install.py +20 -0
  17. extensions/microsoftexcel-controlnet/javascript/hints.js +17 -0
  18. extensions/microsoftexcel-controlnet/models/cldm_v15.yaml +79 -0
  19. extensions/microsoftexcel-controlnet/models/cldm_v21.yaml +85 -0
  20. extensions/microsoftexcel-controlnet/models/control_sd15_canny.yaml +79 -0
  21. extensions/microsoftexcel-controlnet/models/control_sd15_depth.yaml +79 -0
  22. extensions/microsoftexcel-controlnet/models/control_sd15_hed.yaml +79 -0
  23. extensions/microsoftexcel-controlnet/models/control_sd15_mlsd.yaml +79 -0
  24. extensions/microsoftexcel-controlnet/models/control_sd15_normal.yaml +79 -0
  25. extensions/microsoftexcel-controlnet/models/control_sd15_openpose.yaml +79 -0
  26. extensions/microsoftexcel-controlnet/models/control_sd15_scribble.yaml +79 -0
  27. extensions/microsoftexcel-controlnet/models/control_sd15_seg.yaml +79 -0
  28. extensions/microsoftexcel-controlnet/models/control_v11e_sd15_ip2p.yaml +79 -0
  29. extensions/microsoftexcel-controlnet/models/control_v11e_sd15_shuffle.yaml +80 -0
  30. extensions/microsoftexcel-controlnet/models/control_v11f1e_sd15_tile.safetensors +3 -0
  31. extensions/microsoftexcel-controlnet/models/control_v11f1e_sd15_tile.yaml +79 -0
  32. extensions/microsoftexcel-controlnet/models/control_v11f1p_sd15_depth.safetensors +3 -0
  33. extensions/microsoftexcel-controlnet/models/control_v11f1p_sd15_depth.yaml +79 -0
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/__init__.py ADDED
@@ -0,0 +1,31 @@
+ # MIT License
+
+ # Copyright (c) 2022 Intelligent Systems Lab Org
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # File author: Shariq Farooq Bhat
+
+ from .zoedepth_nk_v1 import ZoeDepthNK
+
+ all_versions = {
+     "v1": ZoeDepthNK,
+ }
+
+ get_version = lambda v: all_versions[v]
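This module is a tiny version registry: all_versions maps a version string to the model class and get_version looks it up. A minimal usage sketch, assuming the package is importable as zoedepth.models.zoedepth_nk (the upstream ZoeDepth layout):

from zoedepth.models.zoedepth_nk import get_version

ZoeDepthNK_cls = get_version("v1")        # resolves the "v1" entry to the ZoeDepthNK class
print(ZoeDepthNK_cls.__name__)            # "ZoeDepthNK"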
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json ADDED
@@ -0,0 +1,67 @@
+ {
+     "model": {
+         "name": "ZoeDepthNK",
+         "version_name": "v1",
+         "bin_conf" : [
+             {
+                 "name": "nyu",
+                 "n_bins": 64,
+                 "min_depth": 1e-3,
+                 "max_depth": 10.0
+             },
+             {
+                 "name": "kitti",
+                 "n_bins": 64,
+                 "min_depth": 1e-3,
+                 "max_depth": 80.0
+             }
+         ],
+         "bin_embedding_dim": 128,
+         "bin_centers_type": "softplus",
+         "n_attractors": [16, 8, 4, 1],
+         "attractor_alpha": 1000,
+         "attractor_gamma": 2,
+         "attractor_kind" : "mean",
+         "attractor_type" : "inv",
+         "min_temp": 0.0212,
+         "max_temp": 50.0,
+         "memory_efficient": true,
+         "midas_model_type" : "DPT_BEiT_L_384",
+         "img_size": [384, 512]
+     },
+
+     "train": {
+         "train_midas": true,
+         "use_pretrained_midas": true,
+         "trainer": "zoedepth_nk",
+         "epochs": 5,
+         "bs": 16,
+         "optim_kwargs": {"lr": 0.0002512, "wd": 0.01},
+         "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase": false, "cycle_momentum": true},
+         "same_lr": false,
+         "w_si": 1,
+         "w_domain": 100,
+         "avoid_boundary": false,
+         "random_crop": false,
+         "input_width": 640,
+         "input_height": 480,
+         "w_grad": 0,
+         "w_reg": 0,
+         "midas_lr_factor": 10,
+         "encoder_lr_factor": 10,
+         "pos_enc_lr_factor": 10
+     },
+
+     "infer": {
+         "train_midas": false,
+         "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt",
+         "use_pretrained_midas": false,
+         "force_keep_ar": true
+     },
+
+     "eval": {
+         "train_midas": false,
+         "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt",
+         "use_pretrained_midas": false
+     }
+ }
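The "model" block supplies the constructor arguments for ZoeDepthNK (two bin configurations, one per metric head), while "train"/"infer"/"eval" override per mode; get_config in zoedepth/utils/config.py flattens and merges these. A minimal sketch of reading the file directly, assuming it sits in the working directory:

import json

with open("config_zoedepth_nk.json") as f:
    cfg = json.load(f)

# mode-specific keys override the shared "model" block, mirroring what get_config does
infer_kwargs = {**cfg["model"], **cfg["infer"]}
print(infer_kwargs["bin_conf"][0]["name"], infer_kwargs["pretrained_resource"])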
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py ADDED
@@ -0,0 +1,333 @@
+ # MIT License
+
+ # Copyright (c) 2022 Intelligent Systems Lab Org
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # File author: Shariq Farooq Bhat
+
+ import itertools
+
+ import torch
+ import torch.nn as nn
+
+ from zoedepth.models.depth_model import DepthModel
+ from zoedepth.models.base_models.midas import MidasCore
+ from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed
+ from zoedepth.models.layers.dist_layers import ConditionalLogBinomial
+ from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor,
+                                                      SeedBinRegressorUnnormed)
+ from zoedepth.models.layers.patch_transformer import PatchTransformerEncoder
+ from zoedepth.models.model_io import load_state_from_resource
+
+
+ class ZoeDepthNK(DepthModel):
+     def __init__(self, core, bin_conf, bin_centers_type="softplus", bin_embedding_dim=128,
+                  n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp',
+                  min_temp=5, max_temp=50,
+                  memory_efficient=False, train_midas=True,
+                  is_midas_pretrained=True, midas_lr_factor=1, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs):
+         """ZoeDepthNK model. This is the version of ZoeDepth that has two metric heads and uses a learned router to route to experts.
+
+         Args:
+             core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features
+
+             bin_conf (List[dict]): A list of dictionaries that contain the bin configuration for each metric head. Each dictionary should contain the following keys:
+                 "name" (str, typically same as the dataset name), "n_bins" (int), "min_depth" (float), "max_depth" (float)
+
+                 The length of this list determines the number of metric heads.
+             bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, the linear normalization trick is applied, which results in bounded bin centers.
+                 For "softplus", softplus activation is used and the centers are thus unbounded. Defaults to "softplus".
+             bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128.
+
+             n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1].
+             attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300.
+             attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2.
+             attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'.
+             attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'.
+
+             min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5.
+             max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50.
+
+             memory_efficient (bool, optional): Whether to use the memory efficient version of the attractor layers. The memory efficient version is slower but is recommended in case of multiple metric heads in order to save GPU memory. Defaults to False.
+
+             train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True.
+             is_midas_pretrained (bool, optional): Is "core" pretrained? Defaults to True.
+             midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10.
+             encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10.
+             pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10.
+
+         """
+
+         super().__init__()
+
+         self.core = core
+         self.bin_conf = bin_conf
+         self.min_temp = min_temp
+         self.max_temp = max_temp
+         self.memory_efficient = memory_efficient
+         self.train_midas = train_midas
+         self.is_midas_pretrained = is_midas_pretrained
+         self.midas_lr_factor = midas_lr_factor
+         self.encoder_lr_factor = encoder_lr_factor
+         self.pos_enc_lr_factor = pos_enc_lr_factor
+         self.inverse_midas = inverse_midas
+
+         N_MIDAS_OUT = 32
+         btlnck_features = self.core.output_channels[0]
+         num_out_features = self.core.output_channels[1:]
+         # self.scales = [16, 8, 4, 2]  # spatial scale factors
+
+         self.conv2 = nn.Conv2d(
+             btlnck_features, btlnck_features, kernel_size=1, stride=1, padding=0)
+
+         # Transformer classifier on the bottleneck
+         self.patch_transformer = PatchTransformerEncoder(
+             btlnck_features, 1, 128, use_class_token=True)
+         self.mlp_classifier = nn.Sequential(
+             nn.Linear(128, 128),
+             nn.ReLU(),
+             nn.Linear(128, 2)
+         )
+
+         if bin_centers_type == "normed":
+             SeedBinRegressorLayer = SeedBinRegressor
+             Attractor = AttractorLayer
+         elif bin_centers_type == "softplus":
+             SeedBinRegressorLayer = SeedBinRegressorUnnormed
+             Attractor = AttractorLayerUnnormed
+         elif bin_centers_type == "hybrid1":
+             SeedBinRegressorLayer = SeedBinRegressor
+             Attractor = AttractorLayerUnnormed
+         elif bin_centers_type == "hybrid2":
+             SeedBinRegressorLayer = SeedBinRegressorUnnormed
+             Attractor = AttractorLayer
+         else:
+             raise ValueError(
+                 "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'")
+         self.bin_centers_type = bin_centers_type
+         # We have bins for each bin conf.
+         # Create a map (ModuleDict) of 'name' -> seed_bin_regressor
+         self.seed_bin_regressors = nn.ModuleDict(
+             {conf['name']: SeedBinRegressorLayer(btlnck_features, conf["n_bins"], mlp_dim=bin_embedding_dim//2, min_depth=conf["min_depth"], max_depth=conf["max_depth"])
+              for conf in bin_conf}
+         )
+
+         self.seed_projector = Projector(
+             btlnck_features, bin_embedding_dim, mlp_dim=bin_embedding_dim//2)
+         self.projectors = nn.ModuleList([
+             Projector(num_out, bin_embedding_dim, mlp_dim=bin_embedding_dim//2)
+             for num_out in num_out_features
+         ])
+
+         # Create a map (ModuleDict) of 'name' -> attractors (ModuleList)
+         self.attractors = nn.ModuleDict(
+             {conf['name']: nn.ModuleList([
+                 Attractor(bin_embedding_dim, n_attractors[i],
+                           mlp_dim=bin_embedding_dim, alpha=attractor_alpha,
+                           gamma=attractor_gamma, kind=attractor_kind,
+                           attractor_type=attractor_type, memory_efficient=memory_efficient,
+                           min_depth=conf["min_depth"], max_depth=conf["max_depth"])
+                 for i in range(len(n_attractors))
+             ])
+                 for conf in bin_conf}
+         )
+
+         last_in = N_MIDAS_OUT
+         # conditional log binomial for each bin conf
+         self.conditional_log_binomial = nn.ModuleDict(
+             {conf['name']: ConditionalLogBinomial(last_in, bin_embedding_dim, conf['n_bins'], bottleneck_factor=4, min_temp=self.min_temp, max_temp=self.max_temp)
+              for conf in bin_conf}
+         )
+
+     def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs):
+         """
+         Args:
+             x (torch.Tensor): Input image tensor of shape (B, C, H, W). Assumes all images are from the same domain.
+             return_final_centers (bool, optional): Whether to return the final centers of the attractors. Defaults to False.
+             denorm (bool, optional): Whether to denormalize the input image. Defaults to False.
+             return_probs (bool, optional): Whether to return the probabilities of the bins. Defaults to False.
+
+         Returns:
+             dict: Dictionary of outputs with keys:
+                 - "rel_depth": Relative depth map of shape (B, 1, H, W)
+                 - "metric_depth": Metric depth map of shape (B, 1, H, W)
+                 - "domain_logits": Domain logits of shape (B, 2)
+                 - "bin_centers": Bin centers of shape (B, N, H, W). Present only if return_final_centers is True
+                 - "probs": Bin probabilities of shape (B, N, H, W). Present only if return_probs is True
+         """
+         b, c, h, w = x.shape
+         self.orig_input_width = w
+         self.orig_input_height = h
+         rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True)
+
+         outconv_activation = out[0]
+         btlnck = out[1]
+         x_blocks = out[2:]
+
+         x_d0 = self.conv2(btlnck)
+         x = x_d0
+
+         # Predict which path to take
+         embedding = self.patch_transformer(x)[0]  # N, E
+         domain_logits = self.mlp_classifier(embedding)  # N, 2
+         domain_vote = torch.softmax(domain_logits.sum(
+             dim=0, keepdim=True), dim=-1)  # 1, 2
+
+         # Get the path
+         bin_conf_name = ["nyu", "kitti"][torch.argmax(
+             domain_vote, dim=-1).squeeze().item()]
+
+         try:
+             conf = [c for c in self.bin_conf if c.name == bin_conf_name][0]
+         except IndexError:
+             raise ValueError(
+                 f"bin_conf_name {bin_conf_name} not found in bin_confs")
+
+         min_depth = conf['min_depth']
+         max_depth = conf['max_depth']
+
+         seed_bin_regressor = self.seed_bin_regressors[bin_conf_name]
+         _, seed_b_centers = seed_bin_regressor(x)
+         if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2':
+             b_prev = (seed_b_centers - min_depth)/(max_depth - min_depth)
+         else:
+             b_prev = seed_b_centers
+         prev_b_embedding = self.seed_projector(x)
+
+         attractors = self.attractors[bin_conf_name]
+         for projector, attractor, x in zip(self.projectors, attractors, x_blocks):
+             b_embedding = projector(x)
+             b, b_centers = attractor(
+                 b_embedding, b_prev, prev_b_embedding, interpolate=True)
+             b_prev = b
+             prev_b_embedding = b_embedding
+
+         last = outconv_activation
+
+         b_centers = nn.functional.interpolate(
+             b_centers, last.shape[-2:], mode='bilinear', align_corners=True)
+         b_embedding = nn.functional.interpolate(
+             b_embedding, last.shape[-2:], mode='bilinear', align_corners=True)
+
+         clb = self.conditional_log_binomial[bin_conf_name]
+         x = clb(last, b_embedding)
+
+         # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
+         # print(x.shape, b_centers.shape)
+         # b_centers = nn.functional.interpolate(b_centers, x.shape[-2:], mode='bilinear', align_corners=True)
+         out = torch.sum(x * b_centers, dim=1, keepdim=True)
+
+         output = dict(domain_logits=domain_logits, metric_depth=out)
+         if return_final_centers or return_probs:
+             output['bin_centers'] = b_centers
+
+         if return_probs:
+             output['probs'] = x
+         return output
+
+     def get_lr_params(self, lr):
+         """
+         Learning rate configuration for different layers of the model
+
+         Args:
+             lr (float) : Base learning rate
+         Returns:
+             list : list of parameters to optimize and their learning rates, in the format required by torch optimizers.
+         """
+         param_conf = []
+         if self.train_midas:
+             def get_rel_pos_params():
+                 for name, p in self.core.core.pretrained.named_parameters():
+                     if "relative_position" in name:
+                         yield p
+
+             def get_enc_params_except_rel_pos():
+                 for name, p in self.core.core.pretrained.named_parameters():
+                     if "relative_position" not in name:
+                         yield p
+
+             encoder_params = get_enc_params_except_rel_pos()
+             rel_pos_params = get_rel_pos_params()
+             midas_params = self.core.core.scratch.parameters()
+             midas_lr_factor = self.midas_lr_factor if self.is_midas_pretrained else 1.0
+             param_conf.extend([
+                 {'params': encoder_params, 'lr': lr / self.encoder_lr_factor},
+                 {'params': rel_pos_params, 'lr': lr / self.pos_enc_lr_factor},
+                 {'params': midas_params, 'lr': lr / midas_lr_factor}
+             ])
+
+         remaining_modules = []
+         for name, child in self.named_children():
+             if name != 'core':
+                 remaining_modules.append(child)
+         remaining_params = itertools.chain(
+             *[child.parameters() for child in remaining_modules])
+         param_conf.append({'params': remaining_params, 'lr': lr})
+         return param_conf
+
+     def get_conf_parameters(self, conf_name):
+         """
+         Returns parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+         """
+         params = []
+         for name, child in self.named_children():
+             if isinstance(child, nn.ModuleDict):
+                 for bin_conf_name, module in child.items():
+                     if bin_conf_name == conf_name:
+                         params += list(module.parameters())
+         return params
+
+     def freeze_conf(self, conf_name):
+         """
+         Freezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+         """
+         for p in self.get_conf_parameters(conf_name):
+             p.requires_grad = False
+
+     def unfreeze_conf(self, conf_name):
+         """
+         Unfreezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+         """
+         for p in self.get_conf_parameters(conf_name):
+             p.requires_grad = True
+
+     def freeze_all_confs(self):
+         """
+         Freezes all the parameters of all the ModuleDicts children
+         """
+         for name, child in self.named_children():
+             if isinstance(child, nn.ModuleDict):
+                 for bin_conf_name, module in child.items():
+                     for p in module.parameters():
+                         p.requires_grad = False
+
+     @staticmethod
+     def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs):
+         core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas,
+                                train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs)
+         model = ZoeDepthNK(core, **kwargs)
+         if pretrained_resource:
+             assert isinstance(pretrained_resource, str), "pretrained_resource must be a string"
+             model = load_state_from_resource(model, pretrained_resource)
+         return model
+
+     @staticmethod
+     def build_from_config(config):
+         return ZoeDepthNK.build(**config)
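A hedged end-to-end sketch of how this class is typically used (not part of the commit): it assumes the zoedepth package and its MiDaS dependencies are importable, and that the DPT_BEiT_L_384 backbone and ZoeD_M12_NK checkpoint can be fetched.

import torch
from zoedepth.utils.config import get_config
from zoedepth.models.zoedepth_nk import ZoeDepthNK

config = get_config("zoedepth_nk", "infer")        # merges config_zoedepth_nk.json for inference
model = ZoeDepthNK.build_from_config(config).eval()

x = torch.rand(1, 3, 384, 512)                     # (B, C, H, W); real inputs are ImageNet-normalized
with torch.no_grad():
    out = model(x, return_probs=True)
print(out["metric_depth"].shape)                   # metric depth, routed to the NYU or KITTI head
print(out["domain_logits"].shape)                  # (1, 2) indoor/outdoor router logits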
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/__init__.py ADDED
@@ -0,0 +1,24 @@
+ # MIT License
+
+ # Copyright (c) 2022 Intelligent Systems Lab Org
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # File author: Shariq Farooq Bhat
+
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/arg_utils.py ADDED
@@ -0,0 +1,33 @@
+
+
+ def infer_type(x):  # hacky way to infer type from string args
+     if not isinstance(x, str):
+         return x
+
+     try:
+         x = int(x)
+         return x
+     except ValueError:
+         pass
+
+     try:
+         x = float(x)
+         return x
+     except ValueError:
+         pass
+
+     return x
+
+
+ def parse_unknown(unknown_args):
+     clean = []
+     for a in unknown_args:
+         if "=" in a:
+             k, v = a.split("=")
+             clean.extend([k, v])
+         else:
+             clean.append(a)
+
+     keys = clean[::2]
+     values = clean[1::2]
+     return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)}
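parse_unknown is meant to consume the leftovers of argparse's parse_known_args, turning "--key value" or "--key=value" pairs into typed config overrides. A short sketch (the surrounding CLI is an assumption, not part of this file):

import argparse
from zoedepth.utils.arg_utils import parse_unknown

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", default="zoedepth_nk")
args, unknown = parser.parse_known_args(["-m", "zoedepth_nk", "--n_bins=128", "--lr", "0.0002"])
overrides = parse_unknown(unknown)
print(overrides)   # {'n_bins': 128, 'lr': 0.0002} -- values typed by infer_type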
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/config.py ADDED
@@ -0,0 +1,437 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import json
26
+ import os
27
+
28
+ from .easydict import EasyDict as edict
29
+ from .arg_utils import infer_type
30
+
31
+ import pathlib
32
+ import platform
33
+
34
+ ROOT = pathlib.Path(__file__).parent.parent.resolve()
35
+
36
+ HOME_DIR = os.path.expanduser("~")
37
+
38
+ COMMON_CONFIG = {
39
+ "save_dir": os.path.expanduser("~/shortcuts/monodepth3_checkpoints"),
40
+ "project": "ZoeDepth",
41
+ "tags": '',
42
+ "notes": "",
43
+ "gpu": None,
44
+ "root": ".",
45
+ "uid": None,
46
+ "print_losses": False
47
+ }
48
+
49
+ DATASETS_CONFIG = {
50
+ "kitti": {
51
+ "dataset": "kitti",
52
+ "min_depth": 0.001,
53
+ "max_depth": 80,
54
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
55
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
56
+ "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt",
57
+ "input_height": 352,
58
+ "input_width": 1216, # 704
59
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
60
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
61
+ "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt",
62
+
63
+ "min_depth_eval": 1e-3,
64
+ "max_depth_eval": 80,
65
+
66
+ "do_random_rotate": True,
67
+ "degree": 1.0,
68
+ "do_kb_crop": True,
69
+ "garg_crop": True,
70
+ "eigen_crop": False,
71
+ "use_right": False
72
+ },
73
+ "kitti_test": {
74
+ "dataset": "kitti",
75
+ "min_depth": 0.001,
76
+ "max_depth": 80,
77
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
78
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
79
+ "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt",
80
+ "input_height": 352,
81
+ "input_width": 1216,
82
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
83
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
84
+ "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt",
85
+
86
+ "min_depth_eval": 1e-3,
87
+ "max_depth_eval": 80,
88
+
89
+ "do_random_rotate": False,
90
+ "degree": 1.0,
91
+ "do_kb_crop": True,
92
+ "garg_crop": True,
93
+ "eigen_crop": False,
94
+ "use_right": False
95
+ },
96
+ "nyu": {
97
+ "dataset": "nyu",
98
+ "avoid_boundary": False,
99
+ "min_depth": 1e-3, # originally 0.1
100
+ "max_depth": 10,
101
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"),
102
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"),
103
+ "filenames_file": "./train_test_inputs/nyudepthv2_train_files_with_gt.txt",
104
+ "input_height": 480,
105
+ "input_width": 640,
106
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"),
107
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"),
108
+ "filenames_file_eval": "./train_test_inputs/nyudepthv2_test_files_with_gt.txt",
109
+ "min_depth_eval": 1e-3,
110
+ "max_depth_eval": 10,
111
+ "min_depth_diff": -10,
112
+ "max_depth_diff": 10,
113
+
114
+ "do_random_rotate": True,
115
+ "degree": 1.0,
116
+ "do_kb_crop": False,
117
+ "garg_crop": False,
118
+ "eigen_crop": True
119
+ },
120
+ "ibims": {
121
+ "dataset": "ibims",
122
+ "ibims_root": os.path.join(HOME_DIR, "shortcuts/datasets/ibims/ibims1_core_raw/"),
123
+ "eigen_crop": True,
124
+ "garg_crop": False,
125
+ "do_kb_crop": False,
126
+ "min_depth_eval": 0,
127
+ "max_depth_eval": 10,
128
+ "min_depth": 1e-3,
129
+ "max_depth": 10
130
+ },
131
+ "sunrgbd": {
132
+ "dataset": "sunrgbd",
133
+ "sunrgbd_root": os.path.join(HOME_DIR, "shortcuts/datasets/SUNRGBD/test/"),
134
+ "eigen_crop": True,
135
+ "garg_crop": False,
136
+ "do_kb_crop": False,
137
+ "min_depth_eval": 0,
138
+ "max_depth_eval": 8,
139
+ "min_depth": 1e-3,
140
+ "max_depth": 10
141
+ },
142
+ "diml_indoor": {
143
+ "dataset": "diml_indoor",
144
+ "diml_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_indoor_test/"),
145
+ "eigen_crop": True,
146
+ "garg_crop": False,
147
+ "do_kb_crop": False,
148
+ "min_depth_eval": 0,
149
+ "max_depth_eval": 10,
150
+ "min_depth": 1e-3,
151
+ "max_depth": 10
152
+ },
153
+ "diml_outdoor": {
154
+ "dataset": "diml_outdoor",
155
+ "diml_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_outdoor_test/"),
156
+ "eigen_crop": False,
157
+ "garg_crop": True,
158
+ "do_kb_crop": False,
159
+ "min_depth_eval": 2,
160
+ "max_depth_eval": 80,
161
+ "min_depth": 1e-3,
162
+ "max_depth": 80
163
+ },
164
+ "diode_indoor": {
165
+ "dataset": "diode_indoor",
166
+ "diode_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_indoor/"),
167
+ "eigen_crop": True,
168
+ "garg_crop": False,
169
+ "do_kb_crop": False,
170
+ "min_depth_eval": 1e-3,
171
+ "max_depth_eval": 10,
172
+ "min_depth": 1e-3,
173
+ "max_depth": 10
174
+ },
175
+ "diode_outdoor": {
176
+ "dataset": "diode_outdoor",
177
+ "diode_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_outdoor/"),
178
+ "eigen_crop": False,
179
+ "garg_crop": True,
180
+ "do_kb_crop": False,
181
+ "min_depth_eval": 1e-3,
182
+ "max_depth_eval": 80,
183
+ "min_depth": 1e-3,
184
+ "max_depth": 80
185
+ },
186
+ "hypersim_test": {
187
+ "dataset": "hypersim_test",
188
+ "hypersim_test_root": os.path.join(HOME_DIR, "shortcuts/datasets/hypersim_test/"),
189
+ "eigen_crop": True,
190
+ "garg_crop": False,
191
+ "do_kb_crop": False,
192
+ "min_depth_eval": 1e-3,
193
+ "max_depth_eval": 80,
194
+ "min_depth": 1e-3,
195
+ "max_depth": 10
196
+ },
197
+ "vkitti": {
198
+ "dataset": "vkitti",
199
+ "vkitti_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti_test/"),
200
+ "eigen_crop": False,
201
+ "garg_crop": True,
202
+ "do_kb_crop": True,
203
+ "min_depth_eval": 1e-3,
204
+ "max_depth_eval": 80,
205
+ "min_depth": 1e-3,
206
+ "max_depth": 80
207
+ },
208
+ "vkitti2": {
209
+ "dataset": "vkitti2",
210
+ "vkitti2_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti2/"),
211
+ "eigen_crop": False,
212
+ "garg_crop": True,
213
+ "do_kb_crop": True,
214
+ "min_depth_eval": 1e-3,
215
+ "max_depth_eval": 80,
216
+ "min_depth": 1e-3,
217
+ "max_depth": 80,
218
+ },
219
+ "ddad": {
220
+ "dataset": "ddad",
221
+ "ddad_root": os.path.join(HOME_DIR, "shortcuts/datasets/ddad/ddad_val/"),
222
+ "eigen_crop": False,
223
+ "garg_crop": True,
224
+ "do_kb_crop": True,
225
+ "min_depth_eval": 1e-3,
226
+ "max_depth_eval": 80,
227
+ "min_depth": 1e-3,
228
+ "max_depth": 80,
229
+ },
230
+ }
231
+
232
+ ALL_INDOOR = ["nyu", "ibims", "sunrgbd", "diode_indoor", "hypersim_test"]
233
+ ALL_OUTDOOR = ["kitti", "diml_outdoor", "diode_outdoor", "vkitti2", "ddad"]
234
+ ALL_EVAL_DATASETS = ALL_INDOOR + ALL_OUTDOOR
235
+
236
+ COMMON_TRAINING_CONFIG = {
237
+ "dataset": "nyu",
238
+ "distributed": True,
239
+ "workers": 16,
240
+ "clip_grad": 0.1,
241
+ "use_shared_dict": False,
242
+ "shared_dict": None,
243
+ "use_amp": False,
244
+
245
+ "aug": True,
246
+ "random_crop": False,
247
+ "random_translate": False,
248
+ "translate_prob": 0.2,
249
+ "max_translation": 100,
250
+
251
+ "validate_every": 0.25,
252
+ "log_images_every": 0.1,
253
+ "prefetch": False,
254
+ }
255
+
256
+
257
+ def flatten(config, except_keys=('bin_conf')):
258
+ def recurse(inp):
259
+ if isinstance(inp, dict):
260
+ for key, value in inp.items():
261
+ if key in except_keys:
262
+ yield (key, value)
263
+ if isinstance(value, dict):
264
+ yield from recurse(value)
265
+ else:
266
+ yield (key, value)
267
+
268
+ return dict(list(recurse(config)))
269
+
270
+
271
+ def split_combined_args(kwargs):
272
+ """Splits the arguments that are combined with '__' into multiple arguments.
273
+ Combined arguments should have equal number of keys and values.
274
+ Keys are separated by '__' and Values are separated with ';'.
275
+ For example, '__n_bins__lr=256;0.001'
276
+
277
+ Args:
278
+ kwargs (dict): key-value pairs of arguments where key-value is optionally combined according to the above format.
279
+
280
+ Returns:
281
+ dict: Parsed dict with the combined arguments split into individual key-value pairs.
282
+ """
283
+ new_kwargs = dict(kwargs)
284
+ for key, value in kwargs.items():
285
+ if key.startswith("__"):
286
+ keys = key.split("__")[1:]
287
+ values = value.split(";")
288
+ assert len(keys) == len(
289
+ values), f"Combined arguments should have equal number of keys and values. Keys are separated by '__' and Values are separated with ';'. For example, '__n_bins__lr=256;0.001. Given (keys,values) is ({keys}, {values})"
290
+ for k, v in zip(keys, values):
291
+ new_kwargs[k] = v
292
+ return new_kwargs
293
+
294
+
295
+ def parse_list(config, key, dtype=int):
296
+ """Parse a list of values for the key if the value is a string. The values are separated by a comma.
297
+ Modifies the config in place.
298
+ """
299
+ if key in config:
300
+ if isinstance(config[key], str):
301
+ config[key] = list(map(dtype, config[key].split(',')))
302
+ assert isinstance(config[key], list) and all([isinstance(e, dtype) for e in config[key]]
303
+ ), f"{key} should be a list of values dtype {dtype}. Given {config[key]} of type {type(config[key])} with values of type {[type(e) for e in config[key]]}."
304
+
305
+
306
+ def get_model_config(model_name, model_version=None):
307
+ """Find and parse the .json config file for the model.
308
+
309
+ Args:
310
+ model_name (str): name of the model. The config file should be named config_{model_name}[_{model_version}].json under the models/{model_name} directory.
311
+ model_version (str, optional): Specific config version. If specified config_{model_name}_{model_version}.json is searched for and used. Otherwise config_{model_name}.json is used. Defaults to None.
312
+
313
+ Returns:
314
+ easydict: the config dictionary for the model.
315
+ """
316
+ config_fname = f"config_{model_name}_{model_version}.json" if model_version is not None else f"config_{model_name}.json"
317
+ config_file = os.path.join(ROOT, "models", model_name, config_fname)
318
+ if not os.path.exists(config_file):
319
+ return None
320
+
321
+ with open(config_file, "r") as f:
322
+ config = edict(json.load(f))
323
+
324
+ # handle dictionary inheritance
325
+ # only training config is supported for inheritance
326
+ if "inherit" in config.train and config.train.inherit is not None:
327
+ inherit_config = get_model_config(config.train["inherit"]).train
328
+ for key, value in inherit_config.items():
329
+ if key not in config.train:
330
+ config.train[key] = value
331
+ return edict(config)
332
+
333
+
334
+ def update_model_config(config, mode, model_name, model_version=None, strict=False):
335
+ model_config = get_model_config(model_name, model_version)
336
+ if model_config is not None:
337
+ config = {**config, **
338
+ flatten({**model_config.model, **model_config[mode]})}
339
+ elif strict:
340
+ raise ValueError(f"Config file for model {model_name} not found.")
341
+ return config
342
+
343
+
344
+ def check_choices(name, value, choices):
345
+ # return # No checks in dev branch
346
+ if value not in choices:
347
+ raise ValueError(f"{name} {value} not in supported choices {choices}")
348
+
349
+
350
+ KEYS_TYPE_BOOL = ["use_amp", "distributed", "use_shared_dict", "same_lr", "aug", "three_phase",
351
+ "prefetch", "cycle_momentum"] # Casting is not necessary as their int casted values in config are 0 or 1
352
+
353
+
354
+ def get_config(model_name, mode='train', dataset=None, **overwrite_kwargs):
355
+ """Main entry point to get the config for the model.
356
+
357
+ Args:
358
+ model_name (str): name of the desired model.
359
+ mode (str, optional): "train" or "infer". Defaults to 'train'.
360
+ dataset (str, optional): If specified, the corresponding dataset configuration is loaded as well. Defaults to None.
361
+
362
+ Keyword Args: key-value pairs of arguments to overwrite the default config.
363
+
364
+ The order of precedence for overwriting the config is (Higher precedence first):
365
+ # 1. overwrite_kwargs
366
+ # 2. "config_version": Config file version if specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{config_version}.json
367
+ # 3. "version_name": Default Model version specific config specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{version_name}.json
368
+ # 4. common_config: Default config for all models specified in COMMON_CONFIG
369
+
370
+ Returns:
371
+ easydict: The config dictionary for the model.
372
+ """
373
+
374
+
375
+ check_choices("Model", model_name, ["zoedepth", "zoedepth_nk"])
376
+ check_choices("Mode", mode, ["train", "infer", "eval"])
377
+ if mode == "train":
378
+ check_choices("Dataset", dataset, ["nyu", "kitti", "mix", None])
379
+
380
+ config = flatten({**COMMON_CONFIG, **COMMON_TRAINING_CONFIG})
381
+ config = update_model_config(config, mode, model_name)
382
+
383
+ # update with model version specific config
384
+ version_name = overwrite_kwargs.get("version_name", config["version_name"])
385
+ config = update_model_config(config, mode, model_name, version_name)
386
+
387
+ # update with config version if specified
388
+ config_version = overwrite_kwargs.get("config_version", None)
389
+ if config_version is not None:
390
+ print("Overwriting config with config_version", config_version)
391
+ config = update_model_config(config, mode, model_name, config_version)
392
+
393
+ # update with overwrite_kwargs
394
+ # Combined args are useful for hyperparameter search
395
+ overwrite_kwargs = split_combined_args(overwrite_kwargs)
396
+ config = {**config, **overwrite_kwargs}
397
+
398
+ # Casting to bool # TODO: Not necessary. Remove and test
399
+ for key in KEYS_TYPE_BOOL:
400
+ if key in config:
401
+ config[key] = bool(config[key])
402
+
403
+ # Model specific post processing of config
404
+ parse_list(config, "n_attractors")
405
+
406
+ # adjust n_bins for each bin configuration if bin_conf is given and n_bins is passed in overwrite_kwargs
407
+ if 'bin_conf' in config and 'n_bins' in overwrite_kwargs:
408
+ bin_conf = config['bin_conf'] # list of dicts
409
+ n_bins = overwrite_kwargs['n_bins']
410
+ new_bin_conf = []
411
+ for conf in bin_conf:
412
+ conf['n_bins'] = n_bins
413
+ new_bin_conf.append(conf)
414
+ config['bin_conf'] = new_bin_conf
415
+
416
+ if mode == "train":
417
+ orig_dataset = dataset
418
+ if dataset == "mix":
419
+ dataset = 'nyu' # Use nyu as default for mix. Dataset config is changed accordingly while loading the dataloader
420
+ if dataset is not None:
421
+ config['project'] = f"MonoDepth3-{orig_dataset}" # Set project for wandb
422
+
423
+ if dataset is not None:
424
+ config['dataset'] = dataset
425
+ config = {**DATASETS_CONFIG[dataset], **config}
426
+
427
+
428
+ config['model'] = model_name
429
+ typed_config = {k: infer_type(v) for k, v in config.items()}
430
+ # add hostname to config
431
+ config['hostname'] = platform.node()
432
+ return edict(typed_config)
433
+
434
+
435
+ def change_dataset(config, new_dataset):
436
+ config.update(DATASETS_CONFIG[new_dataset])
437
+ return config
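get_config is the main entry point built on top of these tables; overwrite kwargs take highest precedence, then the version-specific JSON, then the common defaults, and the result comes back as an EasyDict. A minimal sketch, assuming the zoedepth package is importable:

from zoedepth.utils.config import get_config

conf = get_config("zoedepth_nk", mode="train", dataset="nyu", n_bins=128)
print(conf.model, conf.dataset)     # "zoedepth_nk", "nyu"
print(conf.bin_conf[0].n_bins)      # 128 -- the n_bins override is pushed into every bin_conf entry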
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/easydict/__init__.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ EasyDict
3
+ Copy/pasted from https://github.com/makinacorpus/easydict
4
+ Original author: Mathieu Leplatre <mathieu.leplatre@makina-corpus.com>
5
+ """
6
+
7
+ class EasyDict(dict):
8
+ """
9
+ Get attributes
10
+
11
+ >>> d = EasyDict({'foo':3})
12
+ >>> d['foo']
13
+ 3
14
+ >>> d.foo
15
+ 3
16
+ >>> d.bar
17
+ Traceback (most recent call last):
18
+ ...
19
+ AttributeError: 'EasyDict' object has no attribute 'bar'
20
+
21
+ Works recursively
22
+
23
+ >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}})
24
+ >>> isinstance(d.bar, dict)
25
+ True
26
+ >>> d.bar.x
27
+ 1
28
+
29
+ Bullet-proof
30
+
31
+ >>> EasyDict({})
32
+ {}
33
+ >>> EasyDict(d={})
34
+ {}
35
+ >>> EasyDict(None)
36
+ {}
37
+ >>> d = {'a': 1}
38
+ >>> EasyDict(**d)
39
+ {'a': 1}
40
+ >>> EasyDict((('a', 1), ('b', 2)))
41
+ {'a': 1, 'b': 2}
42
+
43
+ Set attributes
44
+
45
+ >>> d = EasyDict()
46
+ >>> d.foo = 3
47
+ >>> d.foo
48
+ 3
49
+ >>> d.bar = {'prop': 'value'}
50
+ >>> d.bar.prop
51
+ 'value'
52
+ >>> d
53
+ {'foo': 3, 'bar': {'prop': 'value'}}
54
+ >>> d.bar.prop = 'newer'
55
+ >>> d.bar.prop
56
+ 'newer'
57
+
58
+
59
+ Values extraction
60
+
61
+ >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]})
62
+ >>> isinstance(d.bar, list)
63
+ True
64
+ >>> from operator import attrgetter
65
+ >>> list(map(attrgetter('x'), d.bar))
66
+ [1, 3]
67
+ >>> list(map(attrgetter('y'), d.bar))
68
+ [2, 4]
69
+ >>> d = EasyDict()
70
+ >>> list(d.keys())
71
+ []
72
+ >>> d = EasyDict(foo=3, bar=dict(x=1, y=2))
73
+ >>> d.foo
74
+ 3
75
+ >>> d.bar.x
76
+ 1
77
+
78
+ Still like a dict though
79
+
80
+ >>> o = EasyDict({'clean':True})
81
+ >>> list(o.items())
82
+ [('clean', True)]
83
+
84
+ And like a class
85
+
86
+ >>> class Flower(EasyDict):
87
+ ... power = 1
88
+ ...
89
+ >>> f = Flower()
90
+ >>> f.power
91
+ 1
92
+ >>> f = Flower({'height': 12})
93
+ >>> f.height
94
+ 12
95
+ >>> f['power']
96
+ 1
97
+ >>> sorted(f.keys())
98
+ ['height', 'power']
99
+
100
+ update and pop items
101
+ >>> d = EasyDict(a=1, b='2')
102
+ >>> e = EasyDict(c=3.0, a=9.0)
103
+ >>> d.update(e)
104
+ >>> d.c
105
+ 3.0
106
+ >>> d['c']
107
+ 3.0
108
+ >>> d.get('c')
109
+ 3.0
110
+ >>> d.update(a=4, b=4)
111
+ >>> d.b
112
+ 4
113
+ >>> d.pop('a')
114
+ 4
115
+ >>> d.a
116
+ Traceback (most recent call last):
117
+ ...
118
+ AttributeError: 'EasyDict' object has no attribute 'a'
119
+ """
120
+ def __init__(self, d=None, **kwargs):
121
+ if d is None:
122
+ d = {}
123
+ else:
124
+ d = dict(d)
125
+ if kwargs:
126
+ d.update(**kwargs)
127
+ for k, v in d.items():
128
+ setattr(self, k, v)
129
+ # Class attributes
130
+ for k in self.__class__.__dict__.keys():
131
+ if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'):
132
+ setattr(self, k, getattr(self, k))
133
+
134
+ def __setattr__(self, name, value):
135
+ if isinstance(value, (list, tuple)):
136
+ value = [self.__class__(x)
137
+ if isinstance(x, dict) else x for x in value]
138
+ elif isinstance(value, dict) and not isinstance(value, self.__class__):
139
+ value = self.__class__(value)
140
+ super(EasyDict, self).__setattr__(name, value)
141
+ super(EasyDict, self).__setitem__(name, value)
142
+
143
+ __setitem__ = __setattr__
144
+
145
+ def update(self, e=None, **f):
146
+ d = e or dict()
147
+ d.update(f)
148
+ for k in d:
149
+ setattr(self, k, d[k])
150
+
151
+ def pop(self, k, d=None):
152
+ delattr(self, k)
153
+ return super(EasyDict, self).pop(k, d)
154
+
155
+
156
+ if __name__ == "__main__":
157
+ import doctest
158
+ doctest.testmod()
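This vendored EasyDict is what makes the attribute-style access used throughout the config code (config.train.inherit, conf.min_depth, ...) work. A two-line sketch, with the import path taken from this extension's layout:

from zoedepth.utils.easydict import EasyDict as edict

c = edict({"model": "zoedepth_nk", "train": {"bs": 16}})
print(c.model, c.train.bs)   # nested dicts are converted recursively; c["train"]["bs"] stays in sync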
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/geometry.py ADDED
@@ -0,0 +1,98 @@
+ # MIT License
+
+ # Copyright (c) 2022 Intelligent Systems Lab Org
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # File author: Shariq Farooq Bhat
+
+ import numpy as np
+
+ def get_intrinsics(H, W):
+     """
+     Intrinsics for a pinhole camera model.
+     Assume fov of 55 degrees and central principal point.
+     """
+     f = 0.5 * W / np.tan(0.5 * 55 * np.pi / 180.0)
+     cx = 0.5 * W
+     cy = 0.5 * H
+     return np.array([[f, 0, cx],
+                      [0, f, cy],
+                      [0, 0, 1]])
+
+ def depth_to_points(depth, R=None, t=None):
+
+     K = get_intrinsics(depth.shape[1], depth.shape[2])
+     Kinv = np.linalg.inv(K)
+     if R is None:
+         R = np.eye(3)
+     if t is None:
+         t = np.zeros(3)
+
+     # M converts from your coordinate to PyTorch3D's coordinate system
+     M = np.eye(3)
+     M[0, 0] = -1.0
+     M[1, 1] = -1.0
+
+     height, width = depth.shape[1:3]
+
+     x = np.arange(width)
+     y = np.arange(height)
+     coord = np.stack(np.meshgrid(x, y), -1)
+     coord = np.concatenate((coord, np.ones_like(coord)[:, :, [0]]), -1)  # z=1
+     coord = coord.astype(np.float32)
+     # coord = torch.as_tensor(coord, dtype=torch.float32, device=device)
+     coord = coord[None]  # bs, h, w, 3
+
+     D = depth[:, :, :, None, None]
+     # print(D.shape, Kinv[None, None, None, ...].shape, coord[:, :, :, :, None].shape )
+     pts3D_1 = D * Kinv[None, None, None, ...] @ coord[:, :, :, :, None]
+     # pts3D_1 live in your coordinate system. Convert them to Py3D's
+     pts3D_1 = M[None, None, None, ...] @ pts3D_1
+     # from reference to target viewpoint
+     pts3D_2 = R[None, None, None, ...] @ pts3D_1 + t[None, None, None, :, None]
+     # pts3D_2 = pts3D_1
+     # depth_2 = pts3D_2[:, :, :, 2, :]  # b,1,h,w
+     return pts3D_2[:, :, :, :3, 0][0]
+
+
+ def create_triangles(h, w, mask=None):
+     """
+     Reference: https://github.com/google-research/google-research/blob/e96197de06613f1b027d20328e06d69829fa5a89/infinite_nature/render_utils.py#L68
+     Creates mesh triangle indices from a given pixel grid size.
+     This function is not and need not be differentiable as triangle indices are
+     fixed.
+     Args:
+         h: (int) denoting the height of the image.
+         w: (int) denoting the width of the image.
+     Returns:
+         triangles: 2D numpy array of indices (int) with shape (2(W-1)(H-1) x 3)
+     """
+     x, y = np.meshgrid(range(w - 1), range(h - 1))
+     tl = y * w + x
+     tr = y * w + x + 1
+     bl = (y + 1) * w + x
+     br = (y + 1) * w + x + 1
+     triangles = np.array([tl, bl, tr, br, tr, bl])
+     triangles = np.transpose(triangles, (1, 2, 0)).reshape(
+         ((w - 1) * (h - 1) * 2, 3))
+     if mask is not None:
+         mask = mask.reshape(-1)
+         triangles = triangles[mask[triangles].all(1)]
+     return triangles
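A small sketch of how these two helpers fit together, unprojecting a depth map into a point cloud and building mesh triangles for it (dummy data; the import path is assumed from this extension's layout):

import numpy as np
from zoedepth.utils.geometry import depth_to_points, create_triangles

depth = np.ones((1, 480, 640), dtype=np.float32)   # (1, H, W) metric depth
pts3d = depth_to_points(depth)                     # (H, W, 3) points in the PyTorch3D-style frame
tris = create_triangles(480, 640)                  # (2*(H-1)*(W-1), 3) vertex indices into the H*W grid
print(pts3d.shape, tris.shape)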
extensions/microsoftexcel-controlnet/annotator/zoe/zoedepth/utils/misc.py ADDED
@@ -0,0 +1,368 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ """Miscellaneous utility functions."""
26
+
27
+ from scipy import ndimage
28
+
29
+ import base64
30
+ import math
31
+ import re
32
+ from io import BytesIO
33
+
34
+ import matplotlib
35
+ import matplotlib.cm
36
+ import numpy as np
37
+ import requests
38
+ import torch
39
+ import torch.distributed as dist
40
+ import torch.nn
41
+ import torch.nn as nn
42
+ import torch.utils.data.distributed
43
+ from PIL import Image
44
+ from torchvision.transforms import ToTensor
45
+
46
+
47
+ class RunningAverage:
48
+ def __init__(self):
49
+ self.avg = 0
50
+ self.count = 0
51
+
52
+ def append(self, value):
53
+ self.avg = (value + self.count * self.avg) / (self.count + 1)
54
+ self.count += 1
55
+
56
+ def get_value(self):
57
+ return self.avg
58
+
59
+
60
+ def denormalize(x):
61
+ """Reverses the imagenet normalization applied to the input.
62
+
63
+ Args:
64
+ x (torch.Tensor - shape(N,3,H,W)): input tensor
65
+
66
+ Returns:
67
+ torch.Tensor - shape(N,3,H,W): Denormalized input
68
+ """
69
+ mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
70
+ std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
71
+ return x * std + mean
72
+
73
+
74
+ class RunningAverageDict:
75
+ """A dictionary of running averages."""
76
+ def __init__(self):
77
+ self._dict = None
78
+
79
+ def update(self, new_dict):
80
+ if new_dict is None:
81
+ return
82
+
83
+ if self._dict is None:
84
+ self._dict = dict()
85
+ for key, value in new_dict.items():
86
+ self._dict[key] = RunningAverage()
87
+
88
+ for key, value in new_dict.items():
89
+ self._dict[key].append(value)
90
+
91
+ def get_value(self):
92
+ if self._dict is None:
93
+ return None
94
+ return {key: value.get_value() for key, value in self._dict.items()}
95
+
96
+
97
+ def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None):
98
+ """Converts a depth map to a color image.
99
+
100
+ Args:
101
+ value (torch.Tensor, numpy.ndarray): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed
102
+ vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None.
103
+ vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None.
104
+ cmap (str, optional): matplotlib colormap to use. Defaults to 'gray_r'.
105
+ invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99.
106
+ invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None.
107
+ background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255).
108
+ gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False.
109
+ value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None.
110
+
111
+ Returns:
112
+ numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4)
113
+ """
114
+ if isinstance(value, torch.Tensor):
115
+ value = value.detach().cpu().numpy()
116
+
117
+ value = value.squeeze()
118
+ if invalid_mask is None:
119
+ invalid_mask = value == invalid_val
120
+ mask = np.logical_not(invalid_mask)
121
+
122
+ # normalize
123
+ vmin = np.percentile(value[mask],2) if vmin is None else vmin
124
+ vmax = np.percentile(value[mask],85) if vmax is None else vmax
125
+ if vmin != vmax:
126
+ value = (value - vmin) / (vmax - vmin) # vmin..vmax
127
+ else:
128
+ # Avoid 0-division
129
+ value = value * 0.
130
+
131
+ # squeeze last dim if it exists
132
+ # grey out the invalid values
133
+
134
+ value[invalid_mask] = np.nan
135
+ cmapper = matplotlib.cm.get_cmap(cmap)
136
+ if value_transform:
137
+ value = value_transform(value)
138
+ # value = value / value.max()
139
+ value = cmapper(value, bytes=True) # (nxmx4)
140
+
141
+ # img = value[:, :, :]
142
+ img = value[...]
143
+ img[invalid_mask] = background_color
144
+
145
+ # return img.transpose((2, 0, 1))
146
+ if gamma_corrected:
147
+ # gamma correction
148
+ img = img / 255
149
+ img = np.power(img, 2.2)
150
+ img = img * 255
151
+ img = img.astype(np.uint8)
152
+ return img
153
+
154
+
155
+ def count_parameters(model, include_all=False):
156
+ return sum(p.numel() for p in model.parameters() if p.requires_grad or include_all)
157
+
158
+
159
+ def compute_errors(gt, pred):
160
+ """Compute metrics for 'pred' compared to 'gt'
161
+
162
+ Args:
163
+ gt (numpy.ndarray): Ground truth values
164
+ pred (numpy.ndarray): Predicted values
165
+
166
+ gt.shape should be equal to pred.shape
167
+
168
+ Returns:
169
+ dict: Dictionary containing the following metrics:
170
+ 'a1': Delta1 accuracy: Fraction of pixels that are within a scale factor of 1.25
171
+ 'a2': Delta2 accuracy: Fraction of pixels that are within a scale factor of 1.25^2
172
+ 'a3': Delta3 accuracy: Fraction of pixels that are within a scale factor of 1.25^3
173
+ 'abs_rel': Absolute relative error
174
+ 'rmse': Root mean squared error
175
+ 'log_10': Absolute log10 error
176
+ 'sq_rel': Squared relative error
177
+ 'rmse_log': Root mean squared error on the log scale
178
+ 'silog': Scale invariant log error
179
+ """
180
+ thresh = np.maximum((gt / pred), (pred / gt))
181
+ a1 = (thresh < 1.25).mean()
182
+ a2 = (thresh < 1.25 ** 2).mean()
183
+ a3 = (thresh < 1.25 ** 3).mean()
184
+
185
+ abs_rel = np.mean(np.abs(gt - pred) / gt)
186
+ sq_rel = np.mean(((gt - pred) ** 2) / gt)
187
+
188
+ rmse = (gt - pred) ** 2
189
+ rmse = np.sqrt(rmse.mean())
190
+
191
+ rmse_log = (np.log(gt) - np.log(pred)) ** 2
192
+ rmse_log = np.sqrt(rmse_log.mean())
193
+
194
+ err = np.log(pred) - np.log(gt)
195
+ silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100
196
+
197
+ log_10 = (np.abs(np.log10(gt) - np.log10(pred))).mean()
198
+ return dict(a1=a1, a2=a2, a3=a3, abs_rel=abs_rel, rmse=rmse, log_10=log_10, rmse_log=rmse_log,
199
+ silog=silog, sq_rel=sq_rel)
200
+
201
+
202
+ def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True, dataset='nyu', min_depth_eval=0.1, max_depth_eval=10, **kwargs):
203
+ """Compute metrics of predicted depth maps. Applies cropping and masking as necessary or specified via arguments. Refer to compute_errors for more details on metrics.
204
+ """
205
+ if 'config' in kwargs:
206
+ config = kwargs['config']
207
+ garg_crop = config.garg_crop
208
+ eigen_crop = config.eigen_crop
209
+ min_depth_eval = config.min_depth_eval
210
+ max_depth_eval = config.max_depth_eval
211
+
212
+ if gt.shape[-2:] != pred.shape[-2:] and interpolate:
213
+ pred = nn.functional.interpolate(
214
+ pred, gt.shape[-2:], mode='bilinear', align_corners=True)
215
+
216
+ pred = pred.squeeze().cpu().numpy()
217
+ pred[pred < min_depth_eval] = min_depth_eval
218
+ pred[pred > max_depth_eval] = max_depth_eval
219
+ pred[np.isinf(pred)] = max_depth_eval
220
+ pred[np.isnan(pred)] = min_depth_eval
221
+
222
+ gt_depth = gt.squeeze().cpu().numpy()
223
+ valid_mask = np.logical_and(
224
+ gt_depth > min_depth_eval, gt_depth < max_depth_eval)
225
+
226
+ if garg_crop or eigen_crop:
227
+ gt_height, gt_width = gt_depth.shape
228
+ eval_mask = np.zeros(valid_mask.shape)
229
+
230
+ if garg_crop:
231
+ eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height),
232
+ int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1
233
+
234
+ elif eigen_crop:
235
+ # print("-"*10, " EIGEN CROP ", "-"*10)
236
+ if dataset == 'kitti':
237
+ eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height),
238
+ int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
239
+ else:
240
+ # assert gt_depth.shape == (480, 640), "Error: Eigen crop is currently only valid for (480, 640) images"
241
+ eval_mask[45:471, 41:601] = 1
242
+ else:
243
+ eval_mask = np.ones(valid_mask.shape)
244
+ valid_mask = np.logical_and(valid_mask, eval_mask)
245
+ return compute_errors(gt_depth[valid_mask], pred[valid_mask])
246
+
247
+
248
+ #################################### Model utils ################################################
249
+
250
+
251
+ def parallelize(config, model, find_unused_parameters=True):
252
+
253
+ if config.gpu is not None:
254
+ torch.cuda.set_device(config.gpu)
255
+ model = model.cuda(config.gpu)
256
+
257
+ config.multigpu = False
258
+ if config.distributed:
259
+ # Use DDP
260
+ config.multigpu = True
261
+ config.rank = config.rank * config.ngpus_per_node + config.gpu
262
+ dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url,
263
+ world_size=config.world_size, rank=config.rank)
264
+ config.batch_size = int(config.batch_size / config.ngpus_per_node)
265
+ # config.batch_size = 8
266
+ config.workers = int(
267
+ (config.num_workers + config.ngpus_per_node - 1) / config.ngpus_per_node)
268
+ print("Device", config.gpu, "Rank", config.rank, "batch size",
269
+ config.batch_size, "Workers", config.workers)
270
+ torch.cuda.set_device(config.gpu)
271
+ model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
272
+ model = model.cuda(config.gpu)
273
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu], output_device=config.gpu,
274
+ find_unused_parameters=find_unused_parameters)
275
+
276
+ elif config.gpu is None:
277
+ # Use DP
278
+ config.multigpu = True
279
+ model = model.cuda()
280
+ model = torch.nn.DataParallel(model)
281
+
282
+ return model
283
+
284
+
285
+ #################################################################################################
286
+
287
+
288
+ #####################################################################################################
289
+
290
+
291
+ class colors:
292
+ '''Colors class:
293
+ Reset all colors with colors.reset
294
+ Two subclasses fg for foreground and bg for background.
295
+ Use as colors.subclass.colorname.
296
+ i.e. colors.fg.red or colors.bg.green
297
+ Also, the generic bold, disable, underline, reverse, strikethrough,
298
+ and invisible work with the main class
299
+ i.e. colors.bold
300
+ '''
301
+ reset = '\033[0m'
302
+ bold = '\033[01m'
303
+ disable = '\033[02m'
304
+ underline = '\033[04m'
305
+ reverse = '\033[07m'
306
+ strikethrough = '\033[09m'
307
+ invisible = '\033[08m'
308
+
309
+ class fg:
310
+ black = '\033[30m'
311
+ red = '\033[31m'
312
+ green = '\033[32m'
313
+ orange = '\033[33m'
314
+ blue = '\033[34m'
315
+ purple = '\033[35m'
316
+ cyan = '\033[36m'
317
+ lightgrey = '\033[37m'
318
+ darkgrey = '\033[90m'
319
+ lightred = '\033[91m'
320
+ lightgreen = '\033[92m'
321
+ yellow = '\033[93m'
322
+ lightblue = '\033[94m'
323
+ pink = '\033[95m'
324
+ lightcyan = '\033[96m'
325
+
326
+ class bg:
327
+ black = '\033[40m'
328
+ red = '\033[41m'
329
+ green = '\033[42m'
330
+ orange = '\033[43m'
331
+ blue = '\033[44m'
332
+ purple = '\033[45m'
333
+ cyan = '\033[46m'
334
+ lightgrey = '\033[47m'
335
+
336
+
337
+ def printc(text, color):
338
+ print(f"{color}{text}{colors.reset}")
339
+
340
+ ############################################
341
+
342
+ def get_image_from_url(url):
343
+ response = requests.get(url)
344
+ img = Image.open(BytesIO(response.content)).convert("RGB")
345
+ return img
346
+
347
+ def url_to_torch(url, size=(384, 384)):
348
+ img = get_image_from_url(url)
349
+ img = img.resize(size, Image.LANCZOS) # Image.ANTIALIAS was removed in newer Pillow; LANCZOS is the same filter
350
+ img = torch.from_numpy(np.asarray(img)).float()
351
+ img = img.permute(2, 0, 1)
352
+ img.div_(255)
353
+ return img
354
+
355
+ def pil_to_batched_tensor(img):
356
+ return ToTensor()(img).unsqueeze(0)
357
+
358
+ def save_raw_16bit(depth, fpath="raw.png"):
359
+ if isinstance(depth, torch.Tensor):
360
+ depth = depth.squeeze().cpu().numpy()
361
+
362
+ assert isinstance(depth, np.ndarray), "Depth must be a torch tensor or numpy array"
363
+ assert depth.ndim == 2, "Depth must be 2D"
364
+ depth = depth * 256 # scale for 16-bit png
365
+ depth = depth.astype(np.uint16)
366
+ depth = Image.fromarray(depth)
367
+ depth.save(fpath)
368
+ print("Saved raw depth to", fpath)
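For reference, a minimal usage sketch of the evaluation and visualization helpers defined above (RunningAverageDict, compute_metrics, colorize, save_raw_16bit). The tensors are random stand-ins for a real ground-truth/prediction pair and the output filenames are arbitrary, so treat this as an illustration rather than part of the module.

import torch
from PIL import Image

# toy (N, 1, H, W) ground truth and prediction in the NYU depth range
gt = torch.rand(1, 1, 480, 640) * 9.8 + 0.2
pred = (gt + 0.05 * torch.randn_like(gt)).clamp(min=0.01)

metrics = RunningAverageDict()
metrics.update(compute_metrics(gt, pred, dataset='nyu'))  # a1/a2/a3, abs_rel, rmse, ...
print(metrics.get_value())

colored = colorize(pred, vmin=0.1, vmax=10, cmap='magma_r')  # (H, W, 4) uint8 RGBA
Image.fromarray(colored).save("depth_vis.png")
save_raw_16bit(pred, "raw_depth.png")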
extensions/microsoftexcel-controlnet/example/api_img2img.ipynb ADDED
@@ -0,0 +1,105 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# controlnet + img2img\n",
10
+ "# enable `Allow other script to control this extension` in settings\n",
11
+ "\n",
12
+ "import requests\n",
13
+ "import cv2\n",
14
+ "from base64 import b64encode\n",
15
+ "\n",
16
+ "def readImage(path):\n",
17
+ " img = cv2.imread(path)\n",
18
+ " retval, buffer = cv2.imencode('.jpg', img)\n",
19
+ " b64img = b64encode(buffer).decode(\"utf-8\")\n",
20
+ " return b64img\n",
21
+ "\n",
22
+ "b64img = readImage(\"/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg\")\n",
23
+ "\n",
24
+ "class controlnetRequest():\n",
25
+ " def __init__(self, prompt):\n",
26
+ " self.url = \"http://localhost:7860/controlnet/img2img\"\n",
27
+ " self.body = {\n",
28
+ " \"init_images\": [b64img],\n",
29
+ " \"prompt\": prompt,\n",
30
+ " \"negative_prompt\": \"\",\n",
31
+ " \"seed\": -1,\n",
32
+ " \"subseed\": -1,\n",
33
+ " \"subseed_strength\": 0,\n",
34
+ " \"batch_size\": 1,\n",
35
+ " \"n_iter\": 1,\n",
36
+ " \"steps\": 20,\n",
37
+ " \"cfg_scale\": 7,\n",
38
+ " \"width\": 512,\n",
39
+ " \"height\": 768,\n",
40
+ " \"restore_faces\": True,\n",
41
+ " \"eta\": 0,\n",
42
+ " \"sampler_index\": \"Euler a\",\n",
43
+ " \"controlnet_input_image\": [b64img],\n",
44
+ " \"controlnet_module\": 'canny',\n",
45
+ " \"controlnet_model\": 'control_canny-fp16 [e3fe7712]',\n",
46
+ " \"controlnet_guidance\": 1.0,\n",
47
+ " }\n",
48
+ "\n",
49
+ " def sendRequest(self):\n",
50
+ " r = requests.post(self.url, json=self.body)\n",
51
+ " return r.json()\n",
52
+ "\n",
53
+ "js = controlnetRequest(\"walter white\").sendRequest()"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "metadata": {},
60
+ "outputs": [],
61
+ "source": [
62
+ "import io, base64\n",
63
+ "import matplotlib.pyplot as plt\n",
64
+ "from PIL import Image\n",
65
+ "\n",
66
+ "pil_img = Image.open('/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg')\n",
67
+ "image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][0])))\n",
68
+ "mask_image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][1])))\n",
69
+ "\n",
70
+ "plt.figure()\n",
71
+ "f, axarr = plt.subplots(1,3) \n",
72
+ "axarr[0].imshow(pil_img) \n",
73
+ "axarr[1].imshow(image) \n",
74
+ "axarr[2].imshow(mask_image) "
75
+ ]
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "kernelspec": {
80
+ "display_name": "pynb",
81
+ "language": "python",
82
+ "name": "python3"
83
+ },
84
+ "language_info": {
85
+ "codemirror_mode": {
86
+ "name": "ipython",
87
+ "version": 3
88
+ },
89
+ "file_extension": ".py",
90
+ "mimetype": "text/x-python",
91
+ "name": "python",
92
+ "nbconvert_exporter": "python",
93
+ "pygments_lexer": "ipython3",
94
+ "version": "3.10.9"
95
+ },
96
+ "orig_nbformat": 4,
97
+ "vscode": {
98
+ "interpreter": {
99
+ "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206"
100
+ }
101
+ }
102
+ },
103
+ "nbformat": 4,
104
+ "nbformat_minor": 2
105
+ }
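The notebook only plots the returned images; if you also want to keep them, a small follow-up cell along these lines saves every image in the response (`js` is the dict returned by `sendRequest()` above; filenames are arbitrary):

import io, base64
from PIL import Image

for i, b64 in enumerate(js["images"]):
    # each entry is a base64-encoded PNG/JPEG produced by the WebUI
    Image.open(io.BytesIO(base64.b64decode(b64))).save(f"controlnet_img2img_{i}.png")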
extensions/microsoftexcel-controlnet/example/api_txt2img.ipynb ADDED
@@ -0,0 +1,104 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# controlnet + txt2img\n",
10
+ "# enable `Allow other script to control this extension` in settings\n",
11
+ "\n",
12
+ "import requests\n",
13
+ "import cv2\n",
14
+ "from base64 import b64encode\n",
15
+ "\n",
16
+ "def readImage(path):\n",
17
+ " img = cv2.imread(path)\n",
18
+ " retval, buffer = cv2.imencode('.jpg', img)\n",
19
+ " b64img = b64encode(buffer).decode(\"utf-8\")\n",
20
+ " return b64img\n",
21
+ "\n",
22
+ "b64img = readImage(\"/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg\")\n",
23
+ "\n",
24
+ "class controlnetRequest():\n",
25
+ " def __init__(self, prompt):\n",
26
+ " self.url = \"http://localhost:7860/controlnet/txt2img\"\n",
27
+ " self.body = {\n",
28
+ " \"prompt\": prompt,\n",
29
+ " \"negative_prompt\": \"\",\n",
30
+ " \"seed\": -1,\n",
31
+ " \"subseed\": -1,\n",
32
+ " \"subseed_strength\": 0,\n",
33
+ " \"batch_size\": 1,\n",
34
+ " \"n_iter\": 1,\n",
35
+ " \"steps\": 15,\n",
36
+ " \"cfg_scale\": 7,\n",
37
+ " \"width\": 512,\n",
38
+ " \"height\": 768,\n",
39
+ " \"restore_faces\": True,\n",
40
+ " \"eta\": 0,\n",
41
+ " \"sampler_index\": \"Euler a\",\n",
42
+ " \"controlnet_input_image\": [b64img],\n",
43
+ " \"controlnet_module\": 'canny',\n",
44
+ " \"controlnet_model\": 'control_canny-fp16 [e3fe7712]',\n",
45
+ " \"controlnet_guidance\": 1.0,\n",
46
+ " }\n",
47
+ "\n",
48
+ " def sendRequest(self):\n",
49
+ " r = requests.post(self.url, json=self.body)\n",
50
+ " return r.json()\n",
51
+ "\n",
52
+ "js = controlnetRequest(\"walter white\").sendRequest()"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "import io, base64\n",
62
+ "import matplotlib.pyplot as plt\n",
63
+ "from PIL import Image\n",
64
+ "\n",
65
+ "pil_img = Image.open('/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg')\n",
66
+ "image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][0])))\n",
67
+ "mask_image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][1])))\n",
68
+ "\n",
69
+ "plt.figure()\n",
70
+ "f, axarr = plt.subplots(1,3) \n",
71
+ "axarr[0].imshow(pil_img) \n",
72
+ "axarr[1].imshow(image) \n",
73
+ "axarr[2].imshow(mask_image) "
74
+ ]
75
+ }
76
+ ],
77
+ "metadata": {
78
+ "kernelspec": {
79
+ "display_name": "pynb",
80
+ "language": "python",
81
+ "name": "python3"
82
+ },
83
+ "language_info": {
84
+ "codemirror_mode": {
85
+ "name": "ipython",
86
+ "version": 3
87
+ },
88
+ "file_extension": ".py",
89
+ "mimetype": "text/x-python",
90
+ "name": "python",
91
+ "nbconvert_exporter": "python",
92
+ "pygments_lexer": "ipython3",
93
+ "version": "3.10.9"
94
+ },
95
+ "orig_nbformat": 4,
96
+ "vscode": {
97
+ "interpreter": {
98
+ "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206"
99
+ }
100
+ }
101
+ },
102
+ "nbformat": 4,
103
+ "nbformat_minor": 2
104
+ }
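Because the request body is plain JSON, the same class can drive a quick parameter sweep; a sketch, with guidance values picked only for illustration:

results = {}
for guidance in (0.5, 1.0, 1.5):
    req = controlnetRequest("walter white")
    req.body["controlnet_guidance"] = guidance  # reuse the body built in the first cell
    results[guidance] = req.sendRequest()       # one txt2img call per guidance value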
extensions/microsoftexcel-controlnet/example/chatgpt.py ADDED
@@ -0,0 +1,676 @@
1
+ import os
2
+ import re
3
+ import uuid
4
+ import cv2
5
+ import torch
6
+ import requests
7
+ import io, base64
8
+ import numpy as np
9
+ import gradio as gr
10
+ from PIL import Image
11
+ from base64 import b64encode
12
+ from omegaconf import OmegaConf
13
+ from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
15
+
16
+ from langchain.agents.initialize import initialize_agent
17
+ from langchain.agents.tools import Tool
18
+ from langchain.chains.conversation.memory import ConversationBufferMemory
19
+ from langchain.llms.openai import OpenAI
20
+
21
+ VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
22
+ Visual ChatGPT is able to process and understand large amounts of text and images. As a language model, Visual ChatGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Visual ChatGPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
23
+ Human may provide new figures to Visual ChatGPT with a description. The description helps Visual ChatGPT to understand this image, but Visual ChatGPT should use tools to finish following tasks, rather than directly imagine from the description.
24
+ Overall, Visual ChatGPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
25
+ TOOLS:
26
+ ------
27
+ Visual ChatGPT has access to the following tools:"""
28
+
29
+ VISUAL_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:
30
+ ```
31
+ Thought: Do I need to use a tool? Yes
32
+ Action: the action to take, should be one of [{tool_names}]
33
+ Action Input: the input to the action
34
+ Observation: the result of the action
35
+ ```
36
+ When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
37
+ ```
38
+ Thought: Do I need to use a tool? No
39
+ {ai_prefix}: [your response here]
40
+ ```
41
+ """
42
+
43
+ VISUAL_CHATGPT_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if it does not exist.
44
+ You will remember to provide the image file name loyally if it's provided in the last tool observation.
45
+ Begin!
46
+ Previous conversation history:
47
+ {chat_history}
48
+ New input: {input}
49
+ Since Visual ChatGPT is a text language model, Visual ChatGPT must use tools to observe images rather than imagination.
50
+ The thoughts and observations are only visible for Visual ChatGPT, Visual ChatGPT should remember to repeat important information in the final response for Human.
51
+ Thought: Do I need to use a tool? {agent_scratchpad}"""
52
+
53
+ ENDPOINT = "http://localhost:7860"
54
+ T2IAPI = ENDPOINT + "/controlnet/txt2img"
55
+ DETECTAPI = ENDPOINT + "/controlnet/detect"
56
+ MODELLIST = ENDPOINT + "/controlnet/model_list"
57
+
58
+ device = "cpu"
59
+ if torch.cuda.is_available():
60
+ device = "cuda"
61
+
62
+ def readImage(path):
63
+ img = cv2.imread(path)
64
+ retval, buffer = cv2.imencode('.jpg', img)
65
+ b64img = b64encode(buffer).decode("utf-8")
66
+ return b64img
67
+
68
+ def get_model(pattern='^control_canny.*'):
69
+ r = requests.get(MODELLIST)
70
+ result = r.json()["model_list"]
71
+ for item in result:
72
+ if re.match(pattern, item):
73
+ return item
74
+
75
+ def do_webui_request(url=T2IAPI, **kwargs):
76
+ reqbody = {
77
+ "prompt": "best quality, extremely detailed",
78
+ "negative_prompt": "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
79
+ "seed": -1,
80
+ "subseed": -1,
81
+ "subseed_strength": 0,
82
+ "batch_size": 1,
83
+ "n_iter": 1,
84
+ "steps": 15,
85
+ "cfg_scale": 7,
86
+ "width": 512,
87
+ "height": 768,
88
+ "restore_faces": True,
89
+ "eta": 0,
90
+ "sampler_index": "Euler a",
91
+ "controlnet_input_images": [],
92
+ "controlnet_module": 'canny',
93
+ "controlnet_model": 'control_canny-fp16 [e3fe7712]',
94
+ "controlnet_guidance": 1.0,
95
+ }
96
+ reqbody.update(kwargs)
97
+ r = requests.post(url, json=reqbody)
98
+ return r.json()
99
+
100
+
101
+ def cut_dialogue_history(history_memory, keep_last_n_words=500):
102
+ tokens = history_memory.split()
103
+ n_tokens = len(tokens)
104
+ print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
105
+ if n_tokens < keep_last_n_words:
106
+ return history_memory
107
+ else:
108
+ paragraphs = history_memory.split('\n')
109
+ last_n_tokens = n_tokens
110
+ while last_n_tokens >= keep_last_n_words:
111
+ last_n_tokens = last_n_tokens - len(paragraphs[0].split(' '))
112
+ paragraphs = paragraphs[1:]
113
+ return '\n' + '\n'.join(paragraphs)
114
+
115
+ def get_new_image_name(org_img_name, func_name="update"):
116
+ head_tail = os.path.split(org_img_name)
117
+ head = head_tail[0]
118
+ tail = head_tail[1]
119
+ name_split = tail.split('.')[0].split('_')
120
+ this_new_uuid = str(uuid.uuid4())[0:4]
121
+ if len(name_split) == 1:
122
+ most_org_file_name = name_split[0]
123
+ recent_prev_file_name = name_split[0]
124
+ new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name)
125
+ else:
126
+ assert len(name_split) == 4
127
+ most_org_file_name = name_split[3]
128
+ recent_prev_file_name = name_split[0]
129
+ new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name)
130
+ return os.path.join(head, new_file_name)
131
+
132
+ class MaskFormer:
133
+ def __init__(self, device):
134
+ self.device = device
135
+ self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
136
+ self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
137
+
138
+ def inference(self, image_path, text):
139
+ threshold = 0.5
140
+ min_area = 0.02
141
+ padding = 20
142
+ original_image = Image.open(image_path)
143
+ image = original_image.resize((512, 512))
144
+ inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt",).to(self.device)
145
+ with torch.no_grad():
146
+ outputs = self.model(**inputs)
147
+ mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold
148
+ area_ratio = len(np.argwhere(mask)) / (mask.shape[0] * mask.shape[1])
149
+ if area_ratio < min_area:
150
+ return None
151
+ true_indices = np.argwhere(mask)
152
+ mask_array = np.zeros_like(mask, dtype=bool)
153
+ for idx in true_indices:
154
+ padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx)
155
+ mask_array[padded_slice] = True
156
+ visual_mask = (mask_array * 255).astype(np.uint8)
157
+ image_mask = Image.fromarray(visual_mask)
158
+ return image_mask.resize(image.size)
159
+
160
+ # class ImageEditing:
161
+ # def __init__(self, device):
162
+ # print("Initializing StableDiffusionInpaint to %s" % device)
163
+ # self.device = device
164
+ # self.mask_former = MaskFormer(device=self.device)
165
+ # # self.inpainting = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting",).to(device)
166
+
167
+ # def remove_part_of_image(self, input):
168
+ # image_path, to_be_removed_txt = input.split(",")
169
+ # print(f'remove_part_of_image: to_be_removed {to_be_removed_txt}')
170
+ # return self.replace_part_of_image(f"{image_path},{to_be_removed_txt},background")
171
+
172
+ # def replace_part_of_image(self, input):
173
+ # image_path, to_be_replaced_txt, replace_with_txt = input.split(",")
174
+ # print(f'replace_part_of_image: replace_with_txt {replace_with_txt}')
175
+ # mask_image = self.mask_former.inference(image_path, to_be_replaced_txt)
176
+ # buffered = io.BytesIO()
177
+ # mask_image.save(buffered, format="JPEG")
178
+ # resp = do_webui_request(
179
+ # url=ENDPOINT + "/sdapi/v1/img2img",
180
+ # init_images=[readImage(image_path)],
181
+ # mask=b64encode(buffered.getvalue()).decode("utf-8"),
182
+ # prompt=replace_with_txt,
183
+ # )
184
+ # image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
185
+ # updated_image_path = get_new_image_name(image_path, func_name="replace-something")
186
+ # updated_image.save(updated_image_path)
187
+ # return updated_image_path
188
+
189
+ # class Pix2Pix:
190
+ # def __init__(self, device):
191
+ # print("Initializing Pix2Pix to %s" % device)
192
+ # self.device = device
193
+ # self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", torch_dtype=torch.float16, safety_checker=None).to(device)
194
+ # self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
195
+
196
+ # def inference(self, inputs):
197
+ # """Change style of image."""
198
+ # print("===>Starting Pix2Pix Inference")
199
+ # image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
200
+ # original_image = Image.open(image_path)
201
+ # image = self.pipe(instruct_text,image=original_image,num_inference_steps=40,image_guidance_scale=1.2,).images[0]
202
+ # updated_image_path = get_new_image_name(image_path, func_name="pix2pix")
203
+ # image.save(updated_image_path)
204
+ # return updated_image_path
205
+
206
+
207
+ class T2I:
208
+ def __init__(self, device):
209
+ print("Initializing T2I to %s" % device)
210
+ self.device = device
211
+ self.text_refine_tokenizer = AutoTokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
212
+ self.text_refine_model = AutoModelForCausalLM.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
213
+ self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model, tokenizer=self.text_refine_tokenizer, device=self.device)
214
+
215
+ def inference(self, text):
216
+ image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
217
+ refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"]
218
+ print(f'{text} refined to {refined_text}')
219
+ resp = do_webui_request(
220
+ url=ENDPOINT + "/sdapi/v1/txt2img",
221
+ prompt=refined_text,
222
+ )
223
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
224
+ image.save(image_filename)
225
+ print(f"Processed T2I.run, text: {text}, image_filename: {image_filename}")
226
+ return image_filename
227
+
228
+
229
+ class ImageCaptioning:
230
+ def __init__(self, device):
231
+ print("Initializing ImageCaptioning to %s" % device)
232
+ self.device = device
233
+ self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
234
+ self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device)
235
+
236
+ def inference(self, image_path):
237
+ inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device)
238
+ out = self.model.generate(**inputs)
239
+ captions = self.processor.decode(out[0], skip_special_tokens=True)
240
+ return captions
241
+
242
+
243
+ class image2canny:
244
+ def inference(self, inputs):
245
+ print("===>Starting image2canny Inference")
246
+ resp = do_webui_request(
247
+ url=DETECTAPI,
248
+ controlnet_input_images=[readImage(inputs)],
249
+ controlnet_module="canny",
250
+ )
251
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
+ updated_image_path = get_new_image_name(inputs, func_name="edge")
252
+ image.save(updated_image_path)
253
+ return updated_image_path
254
+
255
+
256
+ class canny2image:
257
+ def inference(self, inputs):
258
+ print("===>Starting canny2image Inference")
259
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
260
+ resp = do_webui_request(
261
+ prompt=instruct_text,
262
+ controlnet_input_images=[readImage(image_path)],
263
+ controlnet_module="none",
264
+ controlnet_model=get_model(pattern='^control_canny.*'),
265
+ )
266
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
267
+ updated_image_path = get_new_image_name(image_path, func_name="canny2image")
268
+ real_image = image # use the decoded image from the API response
269
+ real_image.save(updated_image_path)
270
+ return updated_image_path
271
+
272
+
273
+ class image2line:
274
+ def inference(self, inputs):
275
+ print("===>Starting image2hough Inference")
276
+ resp = do_webui_request(
277
+ url=DETECTAPI,
278
+ controlnet_input_images=[readImage(inputs)],
279
+ controlnet_module="mlsd",
280
+ )
281
+ updated_image_path = get_new_image_name(inputs, func_name="line-of")
282
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
283
+ image.save(updated_image_path)
284
+ return updated_image_path
285
+
286
+
287
+ class line2image:
288
+ def inference(self, inputs):
289
+ print("===>Starting line2image Inference")
290
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
291
+ resp = do_webui_request(
292
+ prompt=instruct_text,
293
+ controlnet_input_images=[readImage(image_path)],
294
+ controlnet_module="none",
295
+ controlnet_model=get_model(pattern='^control_mlsd.*'),
296
+ )
297
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
298
+ updated_image_path = get_new_image_name(image_path, func_name="line2image")
299
+ real_image = image # use the decoded index-0 image from the API response
300
+ real_image.save(updated_image_path)
301
+ return updated_image_path
302
+
303
+
304
+ class image2hed:
305
+ def inference(self, inputs):
306
+ print("===>Starting image2hed Inference")
307
+ resp = do_webui_request(
308
+ url=DETECTAPI,
309
+ controlnet_input_images=[readImage(inputs)],
310
+ controlnet_module="hed",
311
+ )
312
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
313
+ updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
314
+ image.save(updated_image_path)
315
+ return updated_image_path
316
+
317
+
318
+ class hed2image:
319
+ def inference(self, inputs):
320
+ print("===>Starting hed2image Inference")
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
321
+ resp = do_webui_request(
322
+ prompt=instruct_text,
323
+ controlnet_input_images=[readImage(image_path)],
324
+ controlnet_module="none",
325
+ controlnet_model=get_model(pattern='^control_hed.*'),
326
+ )
327
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
328
+ updated_image_path = get_new_image_name(image_path, func_name="hed2image")
329
+ real_image = image # use the decoded index-0 image from the API response
330
+ real_image.save(updated_image_path)
331
+ return updated_image_path
332
+
333
+
334
+ class image2scribble:
335
+ def inference(self, inputs):
336
+ print("===>Starting image2scribble Inference")
337
+ resp = do_webui_request(
338
+ url=DETECTAPI,
339
+ controlnet_input_images=[readImage(inputs)],
340
+ controlnet_module="scribble",
341
+ )
342
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
343
+ updated_image_path = get_new_image_name(inputs, func_name="scribble")
344
+ image.save(updated_image_path)
345
+ return updated_image_path
346
+
347
+
348
+ class scribble2image:
349
+ def inference(self, inputs):
350
+ print("===>Starting scribble2image Inference")
351
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
352
+ resp = do_webui_request(
353
+ prompt=instruct_text,
354
+ controlnet_input_images=[readImage(image_path)],
355
+ controlnet_module="none",
356
+ controlnet_model=get_model(pattern='^control_scribble.*'),
357
+ )
358
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
359
+ updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
360
+ real_image = image # use the decoded image from the API response
361
+ real_image.save(updated_image_path)
362
+ return updated_image_path
363
+
364
+
365
+ class image2pose:
366
+ def inference(self, inputs):
367
+ print("===>Starting image2pose Inference")
368
+ resp = do_webui_request(
369
+ url=DETECTAPI,
370
+ controlnet_input_images=[readImage(inputs)],
371
+ controlnet_module="openpose",
372
+ )
373
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
374
+ updated_image_path = get_new_image_name(inputs, func_name="human-pose")
375
+ image.save(updated_image_path)
376
+ return updated_image_path
377
+
378
+
379
+ class pose2image:
380
+ def inference(self, inputs):
381
+ print("===>Starting pose2image Inference")
382
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
383
+ resp = do_webui_request(
384
+ prompt=instruct_text,
385
+ controlnet_input_images=[readImage(image_path)],
386
+ controlnet_module="none",
387
+ controlnet_model=get_model(pattern='^control_openpose.*'),
388
+ )
389
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
390
+ updated_image_path = get_new_image_name(image_path, func_name="pose2image")
391
+ real_image = image # use the decoded index-0 image from the API response
392
+ real_image.save(updated_image_path)
393
+ return updated_image_path
394
+
395
+
396
+ class image2seg:
397
+ def inference(self, inputs):
398
+ print("===>Starting image2seg Inference")
399
+ resp = do_webui_request(
400
+ url=DETECTAPI,
401
+ controlnet_input_images=[readImage(inputs)],
402
+ controlnet_module="segmentation",
403
+ )
404
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
405
+ updated_image_path = get_new_image_name(inputs, func_name="segmentation")
406
+ image.save(updated_image_path)
407
+ return updated_image_path
408
+
409
+
410
+ class seg2image:
411
+ def inference(self, inputs):
412
+ print("===>Starting seg2image Inference")
413
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
414
+ resp = do_webui_request(
415
+ prompt=instruct_text,
416
+ controlnet_input_images=[readImage(image_path)],
417
+ controlnet_module="none",
418
+ controlnet_model=get_model(pattern='^control_seg.*'),
419
+ )
420
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
421
+ updated_image_path = get_new_image_name(image_path, func_name="segment2image")
422
+ real_image = image # use the decoded image from the API response
423
+ real_image.save(updated_image_path)
424
+ return updated_image_path
425
+
426
+
427
+ class image2depth:
428
+ def inference(self, inputs):
429
+ print("===>Starting image2depth Inference")
430
+ resp = do_webui_request(
431
+ url=DETECTAPI,
432
+ controlnet_input_images=[readImage(inputs)],
433
+ controlnet_module="depth",
434
+ )
435
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
436
+ updated_image_path = get_new_image_name(inputs, func_name="depth")
437
+ image.save(updated_image_path)
438
+ return updated_image_path
439
+
440
+
441
+ class depth2image:
442
+ def inference(self, inputs):
443
+ print("===>Starting depth2image Inference")
444
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
445
+ resp = do_webui_request(
446
+ prompt=instruct_text,
447
+ controlnet_input_images=[readImage(image_path)],
448
+ controlnet_module="depth",
449
+ controlnet_model=get_model(pattern='^control_depth.*'),
450
+ )
451
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
452
+ updated_image_path = get_new_image_name(image_path, func_name="depth2image")
453
+ real_image = image # use the decoded index-0 image from the API response
454
+ real_image.save(updated_image_path)
455
+ return updated_image_path
456
+
457
+
458
+ class image2normal:
459
+ def inference(self, inputs):
460
+ print("===>Starting image2normal Inference")
461
+ resp = do_webui_request(
462
+ url=DETECTAPI,
463
+ controlnet_input_images=[readImage(inputs)],
464
+ controlnet_module="normal",
465
+ )
466
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
467
+ updated_image_path = get_new_image_name(inputs, func_name="normal-map")
468
+ image.save(updated_image_path)
469
+ return updated_image_path
470
+
471
+
472
+ class normal2image:
473
+ def inference(self, inputs):
474
+ print("===>Starting normal2image Inference")
475
+ image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
476
+ resp = do_webui_request(
477
+ prompt=instruct_text,
478
+ controlnet_input_images=[readImage(image_path)],
479
+ controlnet_module="normal",
480
+ controlnet_model=get_model(pattern='^control_normal.*'),
481
+ )
482
+ image = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))
483
+ updated_image_path = get_new_image_name(image_path, func_name="normal2image")
484
+ real_image = image # use the decoded index-0 image from the API response
485
+ real_image.save(updated_image_path)
486
+ return updated_image_path
487
+
488
+
489
+ class BLIPVQA:
490
+ def __init__(self, device):
491
+ print("Initializing BLIP VQA to %s" % device)
492
+ self.device = device
493
+ self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
494
+ self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(self.device)
495
+
496
+ def get_answer_from_question_and_image(self, inputs):
497
+ image_path, question = inputs.split(",")
498
+ raw_image = Image.open(image_path).convert('RGB')
499
+ print(F'BLIPVQA :question :{question}')
500
+ inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device)
501
+ out = self.model.generate(**inputs)
502
+ answer = self.processor.decode(out[0], skip_special_tokens=True)
503
+ return answer
504
+
505
+
506
+ class ConversationBot:
507
+ def __init__(self):
508
+ print("Initializing VisualChatGPT")
509
+ # self.edit = ImageEditing(device=device)
510
+ self.i2t = ImageCaptioning(device=device)
511
+ self.t2i = T2I(device=device)
512
+ self.image2canny = image2canny()
513
+ self.canny2image = canny2image()
514
+ self.image2line = image2line()
515
+ self.line2image = line2image()
516
+ self.image2hed = image2hed()
517
+ self.hed2image = hed2image()
518
+ self.image2scribble = image2scribble()
519
+ self.scribble2image = scribble2image()
520
+ self.image2pose = image2pose()
521
+ self.pose2image = pose2image()
522
+ self.BLIPVQA = BLIPVQA(device=device)
523
+ self.image2seg = image2seg()
524
+ self.seg2image = seg2image()
525
+ self.image2depth = image2depth()
526
+ self.depth2image = depth2image()
527
+ self.image2normal = image2normal()
528
+ self.normal2image = normal2image()
529
+ # self.pix2pix = Pix2Pix(device="cuda:3")
530
+ self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
531
+ self.tools = [
532
+ Tool(name="Get Photo Description", func=self.i2t.inference,
533
+ description="useful when you want to know what is inside the photo. receives image_path as input. "
534
+ "The input to this tool should be a string, representing the image_path. "),
535
+ Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
536
+ description="useful when you want to generate an image from a user input text and save it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
537
+ "The input to this tool should be a string, representing the text used to generate image. "),
538
+ # Tool(name="Remove Something From The Photo", func=self.edit.remove_part_of_image,
539
+ # description="useful when you want to remove and object or something from the photo from its description or location. "
540
+ # "The input to this tool should be a comma seperated string of two, representing the image_path and the object need to be removed. "),
541
+ # Tool(name="Replace Something From The Photo", func=self.edit.replace_part_of_image,
542
+ # description="useful when you want to replace an object from the object description or location with another object from its description. "
543
+ # "The input to this tool should be a comma seperated string of three, representing the image_path, the object to be replaced, the object to be replaced with "),
544
+
545
+ # Tool(name="Instruct Image Using Text", func=self.pix2pix.inference,
546
+ # description="useful when you want to the style of the image to be like the text. like: make it look like a painting. or make it like a robot. "
547
+ # "The input to this tool should be a comma seperated string of two, representing the image_path and the text. "),
548
+ Tool(name="Answer Question About The Image", func=self.BLIPVQA.get_answer_from_question_and_image,
549
+ description="useful when you need an answer for a question based on an image. like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
550
+ "The input to this tool should be a comma separated string of two, representing the image_path and the question"),
551
+ Tool(name="Edge Detection On Image", func=self.image2canny.inference,
552
+ description="useful when you want to detect the edge of the image. like: detect the edges of this image, or canny detection on image, or perform edge detection on this image, or detect the canny image of this image. "
553
+ "The input to this tool should be a string, representing the image_path"),
554
+ Tool(name="Generate Image Condition On Canny Image", func=self.canny2image.inference,
555
+ description="useful when you want to generate a new real image from both the user description and a canny image. like: generate a real image of an object or something from this canny image, or generate a new real image of an object or something from this edge image. "
556
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description. "),
557
+ Tool(name="Line Detection On Image", func=self.image2line.inference,
558
+ description="useful when you want to detect the straight lines of the image. like: detect the straight lines of this image, or straight line detection on image, or perform straight line detection on this image, or detect the straight line image of this image. "
559
+ "The input to this tool should be a string, representing the image_path"),
560
+ Tool(name="Generate Image Condition On Line Image", func=self.line2image.inference,
561
+ description="useful when you want to generate a new real image from both the user description and a straight line image. like: generate a real image of an object or something from this straight line image, or generate a new real image of an object or something from these straight lines. "
562
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description. "),
563
+ Tool(name="Hed Detection On Image", func=self.image2hed.inference,
564
+ description="useful when you want to detect the soft hed boundary of the image. like: detect the soft hed boundary of this image, or hed boundary detection on image, or perform hed boundary detection on this image, or detect soft hed boundary image of this image. "
565
+ "The input to this tool should be a string, representing the image_path"),
566
+ Tool(name="Generate Image Condition On Soft Hed Boundary Image", func=self.hed2image.inference,
567
+ description="useful when you want to generate a new real image from both the user description and a soft hed boundary image. like: generate a real image of an object or something from this soft hed boundary image, or generate a new real image of an object or something from this hed boundary. "
568
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
569
+ Tool(name="Segmentation On Image", func=self.image2seg.inference,
570
+ description="useful when you want to detect segmentations of the image. like: segment this image, or generate segmentations on this image, or perform segmentation on this image. "
571
+ "The input to this tool should be a string, representing the image_path"),
572
+ Tool(name="Generate Image Condition On Segmentations", func=self.seg2image.inference,
573
+ description="useful when you want to generate a new real image from both the user description and segmentations. like: generate a real image of an object or something from this segmentation image, or generate a new real image of an object or something from these segmentations. "
574
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
575
+ Tool(name="Predict Depth On Image", func=self.image2depth.inference,
576
+ description="useful when you want to detect depth of the image. like: generate the depth from this image, or detect the depth map on this image, or predict the depth for this image. "
577
+ "The input to this tool should be a string, representing the image_path"),
578
+ Tool(name="Generate Image Condition On Depth", func=self.depth2image.inference,
579
+ description="useful when you want to generate a new real image from both the user description and a depth image. like: generate a real image of an object or something from this depth image, or generate a new real image of an object or something from the depth map. "
580
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
581
+ Tool(name="Predict Normal Map On Image", func=self.image2normal.inference,
582
+ description="useful when you want to detect norm map of the image. like: generate normal map from this image, or predict normal map of this image. "
583
+ "The input to this tool should be a string, representing the image_path"),
584
+ Tool(name="Generate Image Condition On Normal Map", func=self.normal2image.inference,
585
+ description="useful when you want to generate a new real image from both the user description and a normal map. like: generate a real image of an object or something from this normal map, or generate a new real image of an object or something from the normal map. "
586
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
587
+ Tool(name="Sketch Detection On Image", func=self.image2scribble.inference,
588
+ description="useful when you want to generate a scribble of the image. like: generate a scribble of this image, or generate a sketch from this image, detect the sketch from this image. "
589
+ "The input to this tool should be a string, representing the image_path"),
590
+ Tool(name="Generate Image Condition On Sketch Image", func=self.scribble2image.inference,
591
+ description="useful when you want to generate a new real image from both the user description and a scribble image or a sketch image. "
592
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description"),
593
+ Tool(name="Pose Detection On Image", func=self.image2pose.inference,
594
+ description="useful when you want to detect the human pose of the image. like: generate human poses of this image, or generate a pose image from this image. "
595
+ "The input to this tool should be a string, representing the image_path"),
596
+ Tool(name="Generate Image Condition On Pose Image", func=self.pose2image.inference,
597
+ description="useful when you want to generate a new real image from both the user description and a human pose image. like: generate a real image of a human from this human pose image, or generate a new real image of a human from this pose. "
598
+ "The input to this tool should be a comma separated string of two, representing the image_path and the user description")]
599
+
600
+ def init_langchain(self, openai_api_key):
601
+ self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
602
+ self.agent = initialize_agent(
603
+ self.tools,
604
+ self.llm,
605
+ agent="conversational-react-description",
606
+ verbose=True,
607
+ memory=self.memory,
608
+ return_intermediate_steps=True,
609
+ agent_kwargs={'prefix': VISUAL_CHATGPT_PREFIX, 'format_instructions': VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': VISUAL_CHATGPT_SUFFIX}
610
+ )
611
+
612
+ def run_text(self, openai_api_key, text, state):
613
+ if not hasattr(self, "agent"):
614
+ self.init_langchain(openai_api_key)
615
+ print("===============Running run_text =============")
616
+ print("Inputs:", text, state)
617
+ print("======>Previous memory:\n %s" % self.agent.memory)
618
+ self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
619
+ res = self.agent({"input": text})
620
+ print("======>Current memory:\n %s" % self.agent.memory)
621
+ response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
622
+ state = state + [(text, response)]
623
+ print("Outputs:", state)
624
+ return state, state
625
+
626
+ def run_image(self, openai_api_key, image, state, txt):
627
+ if not hasattr(self, "agent"):
628
+ self.init_langchain(openai_api_key)
629
+ print("===============Running run_image =============")
630
+ print("Inputs:", image, state)
631
+ print("======>Previous memory:\n %s" % self.agent.memory)
632
+ image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
633
+ print("======>Auto Resize Image...")
634
+ img = Image.open(image.name)
635
+ width, height = img.size
636
+ ratio = min(512 / width, 512 / height)
637
+ width_new, height_new = (round(width * ratio), round(height * ratio))
638
+ img = img.resize((width_new, height_new))
639
+ img = img.convert('RGB')
640
+ img.save(image_filename, "PNG")
641
+ print(f"Resize image from {width}x{height} to {width_new}x{height_new}")
642
+ description = self.i2t.inference(image_filename)
643
+ Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \
644
+ "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description)
645
+ AI_prompt = "Received. "
646
+ self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
647
+ print("======>Current memory:\n %s" % self.agent.memory)
648
+ state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
649
+ print("Outputs:", state)
650
+ return state, state, txt + ' ' + image_filename + ' '
651
+
652
+
653
+ if __name__ == '__main__':
654
+ os.makedirs("image/", exist_ok=True)
655
+ bot = ConversationBot()
656
+ with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
657
+ openai_api_key = gr.Textbox(type="password", label="Enter your OpenAI API key here")
658
+ chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT")
659
+ state = gr.State([])
660
+ with gr.Row():
661
+ with gr.Column(scale=0.7):
662
+ txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
663
+ with gr.Column(scale=0.15, min_width=0):
664
+ clear = gr.Button("Clear️")
665
+ with gr.Column(scale=0.15, min_width=0):
666
+ btn = gr.UploadButton("Upload", file_types=["image"])
667
+
668
+ txt.submit(bot.run_text, [openai_api_key, txt, state], [chatbot, state])
669
+ txt.submit(lambda: "", None, txt)
670
+ btn.upload(bot.run_image, [openai_api_key, btn, state, txt], [chatbot, state, txt])
671
+ clear.click(bot.memory.clear)
672
+ clear.click(lambda: [], None, chatbot)
673
+ clear.click(lambda: [], None, state)
674
+
675
+
676
+ demo.launch(server_name="0.0.0.0", server_port=7864)
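The tool classes can also be exercised without the Gradio UI, as long as the WebUI API is reachable at ENDPOINT; a minimal sketch, with a placeholder image path:

bot = ConversationBot()                                     # loads BLIP, CLIPSeg and MagicPrompt models
caption = bot.i2t.inference("image/example.png")            # BLIP caption of the input image
edge_path = bot.image2canny.inference("image/example.png")  # canny map via the /controlnet/detect API
result_path = bot.canny2image.inference(f"{edge_path}, {caption}")
print(caption, result_path)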
extensions/microsoftexcel-controlnet/example/visual_chatgpt.ipynb ADDED
@@ -0,0 +1,60 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Run WebUI in API mode\n",
10
+ "nohup python launch.py --api --xformers &\n",
11
+ "\n",
12
+ "# Wait until webui fully startup\n",
13
+ "tail -f nohup.out"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "# Install/Upgrade transformers\n",
23
+ "pip install -U transformers\n",
24
+ "\n",
25
+ "# Install deps\n",
26
+ "pip install langchain==0.0.101 openai \n",
27
+ "\n",
28
+ "# Run exmaple\n",
29
+ "python example/chatgpt.py"
30
+ ]
31
+ }
32
+ ],
33
+ "metadata": {
34
+ "kernelspec": {
35
+ "display_name": "pynb",
36
+ "language": "python",
37
+ "name": "python3"
38
+ },
39
+ "language_info": {
40
+ "codemirror_mode": {
41
+ "name": "ipython",
42
+ "version": 3
43
+ },
44
+ "file_extension": ".py",
45
+ "mimetype": "text/x-python",
46
+ "name": "python",
47
+ "nbconvert_exporter": "python",
48
+ "pygments_lexer": "ipython3",
49
+ "version": "3.10.9"
50
+ },
51
+ "orig_nbformat": 4,
52
+ "vscode": {
53
+ "interpreter": {
54
+ "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206"
55
+ }
56
+ }
57
+ },
58
+ "nbformat": 4,
59
+ "nbformat_minor": 2
60
+ }
extensions/microsoftexcel-controlnet/extract_controlnet.py ADDED
@@ -0,0 +1,27 @@
1
+ import argparse
2
+ import torch
3
+ from safetensors.torch import load_file, save_file
4
+
5
+ if __name__ == "__main__":
6
+ parser = argparse.ArgumentParser()
7
+ parser.add_argument("--src", default=None, type=str, required=True, help="Path to the model to convert.")
8
+ parser.add_argument("--dst", default=None, type=str, required=True, help="Path to the output model.")
9
+ parser.add_argument("--half", action="store_true", help="Cast to FP16.")
10
+ args = parser.parse_args()
11
+
12
+ assert args.src is not None, "Must provide a model path!"
13
+ assert args.dst is not None, "Must provide a checkpoint path!"
14
+
15
+ if args.src.endswith(".safetensors"):
16
+ state_dict = load_file(args.src)
17
+ else:
18
+ state_dict = torch.load(args.src)
19
+
20
+ if any([k.startswith("control_model.") for k, v in state_dict.items()]):
21
+ dtype = torch.float16 if args.half else torch.float32
22
+ state_dict = {k.replace("control_model.", ""): v.to(dtype) for k, v in state_dict.items() if k.startswith("control_model.")}
23
+
24
+ if args.dst.endswith(".safetensors"):
25
+ save_file(state_dict, args.dst)
26
+ else:
27
+ torch.save({"state_dict": state_dict}, args.dst)
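Typical invocation, with placeholder paths: `python extract_controlnet.py --src models/control_sd15_canny.pth --dst models/control_canny-fp16.safetensors --half`. The script keeps only the keys prefixed with `control_model.`, strips that prefix, and optionally casts the tensors to FP16 before saving.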
extensions/microsoftexcel-controlnet/extract_controlnet_diff.py ADDED
@@ -0,0 +1,91 @@
1
+ import argparse
2
+ import torch
3
+ from safetensors.torch import load_file, save_file
4
+
5
+ if __name__ == "__main__":
6
+ parser = argparse.ArgumentParser()
7
+ parser.add_argument("--sd15", default=None, type=str, required=True, help="Path to the original sd15.")
8
+ parser.add_argument("--control", default=None, type=str, required=True, help="Path to the sd15 with control.")
9
+ parser.add_argument("--dst", default=None, type=str, required=True, help="Path to the output difference model.")
10
+ parser.add_argument("--fp16", action="store_true", help="Save as fp16.")
11
+ parser.add_argument("--bf16", action="store_true", help="Save as bf16.")
12
+ args = parser.parse_args()
13
+
14
+ assert args.sd15 is not None, "Must provide a original sd15 model path!"
15
+ assert args.control is not None, "Must provide a sd15 with control model path!"
16
+ assert args.dst is not None, "Must provide a output path!"
17
+
18
+ # make differences: copy from https://github.com/lllyasviel/ControlNet/blob/main/tool_transfer_control.py
19
+
20
+ def get_node_name(name, parent_name):
21
+ if len(name) <= len(parent_name):
22
+ return False, ''
23
+ p = name[:len(parent_name)]
24
+ if p != parent_name:
25
+ return False, ''
26
+ return True, name[len(parent_name):]
27
+
28
+ # remove first/cond stage from sd to reduce memory usage
29
+ def remove_first_and_cond(sd):
30
+ keys = list(sd.keys())
31
+ for key in keys:
32
+ is_first_stage, _ = get_node_name(key, 'first_stage_model')
33
+ is_cond_stage, _ = get_node_name(key, 'cond_stage_model')
34
+ if is_first_stage or is_cond_stage:
35
+ sd.pop(key, None)
36
+ return sd
37
+
38
+ print(f"loading: {args.sd15}")
39
+ if args.sd15.endswith(".safetensors"):
40
+ sd15_state_dict = load_file(args.sd15)
41
+ else:
42
+ sd15_state_dict = torch.load(args.sd15)
43
+ sd15_state_dict = sd15_state_dict.pop("state_dict", sd15_state_dict)
44
+ sd15_state_dict = remove_first_and_cond(sd15_state_dict)
45
+
46
+ print(f"loading: {args.control}")
47
+ if args.control.endswith(".safetensors"):
48
+ control_state_dict = load_file(args.control)
49
+ else:
50
+ control_state_dict = torch.load(args.control)
51
+ control_state_dict = remove_first_and_cond(control_state_dict)
52
+
53
+ # make diff of original and control
54
+ print(f"create difference")
55
+ keys = list(control_state_dict.keys())
56
+ final_state_dict = {"difference": torch.tensor(1.0)} # indicates difference
57
+ for key in keys:
58
+ p = control_state_dict.pop(key)
59
+
60
+ is_control, node_name = get_node_name(key, 'control_')
61
+ if not is_control:
62
+ continue
63
+
64
+ sd15_key_name = 'model.diffusion_' + node_name
65
+ if sd15_key_name in sd15_state_dict: # part of U-Net
66
+ # print("in sd15", key, sd15_key_name)
67
+ p_new = p - sd15_state_dict.pop(sd15_key_name)
68
+ if torch.max(torch.abs(p_new)) < 1e-6: # no difference?
69
+ print("no diff", key, sd15_key_name)
70
+ continue
71
+ else:
72
+ # print("not in sd15", key, sd15_key_name)
73
+ p_new = p # hint or zero_conv
74
+
75
+ final_state_dict[key] = p_new
76
+
77
+ save_dtype = None
78
+ if args.fp16:
79
+ save_dtype = torch.float16
80
+ elif args.bf16:
81
+ save_dtype = torch.bfloat16
82
+ if save_dtype is not None:
83
+ for key in final_state_dict.keys():
84
+ final_state_dict[key] = final_state_dict[key].to(save_dtype)
85
+
86
+ print("saving difference.")
87
+ if args.dst.endswith(".safetensors"):
88
+ save_file(final_state_dict, args.dst)
89
+ else:
90
+ torch.save({"state_dict": final_state_dict}, args.dst)
91
+ print("done!")
extensions/microsoftexcel-controlnet/install.py ADDED
@@ -0,0 +1,20 @@
+ import launch
+ import os
+ import pkg_resources
+
+ req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt")
+
+ with open(req_file) as file:
+     for package in file:
+         try:
+             package = package.strip()
+             if '==' in package:
+                 package_name, package_version = package.split('==')
+                 installed_version = pkg_resources.get_distribution(package_name).version
+                 if installed_version != package_version:
+                     launch.run_pip(f"install {package}", f"sd-webui-controlnet requirement: changing {package_name} version from {installed_version} to {package_version}")
+             elif not launch.is_installed(package):
+                 launch.run_pip(f"install {package}", f"sd-webui-controlnet requirement: {package}")
+         except Exception as e:
+             print(e)
+             print(f'Warning: Failed to install {package}, some preprocessors may not work.')
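install.py leans on the webui's `launch` helpers to install anything from requirements.txt that is missing or pinned to a different version. Roughly the same check can be reproduced outside the webui with pkg_resources and pip; this is only an illustrative sketch, not code from the extension:

import subprocess
import sys
import pkg_resources

def ensure(package: str) -> None:
    """Install `package` (optionally pinned as name==version) if it is absent or mismatched."""
    name, _, wanted = package.partition("==")
    try:
        installed = pkg_resources.get_distribution(name).version
        if wanted and installed != wanted:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    except pkg_resources.DistributionNotFound:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

ensure("mediapipe==0.10.0")  # example pin; any requirements.txt entry is handled the same way
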
extensions/microsoftexcel-controlnet/javascript/hints.js ADDED
@@ -0,0 +1,17 @@
+ onUiUpdate(function () {
+     // mouseover tooltips for various UI elements
+     const titles = {
+         '🔄': 'Refresh',
+         '\u2934': 'Send dimensions to stable diffusion',
+         '💥': 'Run preprocessor',
+         '📝': 'Open new canvas',
+         '📷': 'Enable webcam',
+         '⇄': 'Mirror webcam',
+     };
+     gradioApp().querySelectorAll('.cnet-toolbutton').forEach(function (button) {
+         const tooltip = titles[button.textContent];
+         if (tooltip) {
+             button.title = tooltip;
+         }
+     })
+ });
extensions/microsoftexcel-controlnet/models/cldm_v15.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
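These model YAMLs are plain `target`/`params` trees: every node names a class by its dotted import path and the keyword arguments used to construct it. A generic loader, sketched here with PyYAML and importlib to mirror how such configs are usually consumed (the loader actually used by the webui/extension may differ), resolves the `target` and instantiates it with `params`:

import importlib
import yaml

def instantiate(node):
    """Build an object from a {target: ..., params: {...}} config node."""
    module_name, cls_name = node["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**node.get("params", {}))

with open("models/cldm_v15.yaml") as f:
    config = yaml.safe_load(f)

model = instantiate(config["model"])  # assumes the cldm/ldm packages are importable

The control_sd15_*.yaml and control_v11*_sd15_*.yaml files that follow use the same schema; apart from cldm_v21.yaml (OpenCLIP text encoder, context_dim 1024, num_head_channels 64) and the shuffle config (which adds global_average_pooling: True), they are content-identical to cldm_v15.yaml.
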
extensions/microsoftexcel-controlnet/models/cldm_v21.yaml ADDED
@@ -0,0 +1,85 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
extensions/microsoftexcel-controlnet/models/control_sd15_canny.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_depth.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_hed.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_mlsd.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_normal.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_openpose.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_scribble.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_sd15_seg.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_v11e_sd15_ip2p.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_v11e_sd15_shuffle.yaml ADDED
@@ -0,0 +1,80 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+     global_average_pooling: True
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_v11f1e_sd15_tile.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f31868eedb243a77932e3c63907a6ba0a2058b6d65b5c27b89ee1b7f618ea33
+ size 722601104
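The two .safetensors entries in this commit are Git LFS pointer files rather than the weights themselves: they carry only the LFS spec version, the sha256 of the real blob, and its size in bytes. Once the actual file has been downloaded, it can be checked against the pointer with a small helper along these lines (illustrative only):

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream a file and return its hex sha256 digest."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "2f31868eedb243a77932e3c63907a6ba0a2058b6d65b5c27b89ee1b7f618ea33"  # oid from the pointer above
assert sha256_of("models/control_v11f1e_sd15_tile.safetensors") == expected
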
extensions/microsoftexcel-controlnet/models/control_v11f1e_sd15_tile.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
extensions/microsoftexcel-controlnet/models/control_v11f1p_sd15_depth.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6bab8043519c0f563853459c1e4f4e93445a87cef1dcdfa3e1e70115b3c83553
+ size 722601100
extensions/microsoftexcel-controlnet/models/control_v11f1p_sd15_depth.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder