nick_93 committed
Commit bcec54e
1 Parent(s): 2fb7fdb
.gitignore ADDED
@@ -0,0 +1,166 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ *.ckpt
7
+ *.pth
8
+ refer/refer/data/
9
+ depth/kitti_dataset/
10
+ depth/nyu_depth_v2/
11
+
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ cover/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ .pybuilder/
82
+ target/
83
+
84
+ # Jupyter Notebook
85
+ .ipynb_checkpoints
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ # For a library or package, you might want to ignore these files since the code is
93
+ # intended to run in multiple environments; otherwise, check them in:
94
+ # .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/#use-with-ide
116
+ .pdm.toml
117
+
118
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
119
+ __pypackages__/
120
+
121
+ # Celery stuff
122
+ celerybeat-schedule
123
+ celerybeat.pid
124
+
125
+ # SageMath parsed files
126
+ *.sage.py
127
+
128
+ # Environments
129
+ .env
130
+ .venv
131
+ env/
132
+ venv/
133
+ ENV/
134
+ env.bak/
135
+ venv.bak/
136
+
137
+ # Spyder project settings
138
+ .spyderproject
139
+ .spyproject
140
+
141
+ # Rope project settings
142
+ .ropeproject
143
+
144
+ # mkdocs documentation
145
+ /site
146
+
147
+ # mypy
148
+ .mypy_cache/
149
+ .dmypy.json
150
+ dmypy.json
151
+
152
+ # Pyre type checker
153
+ .pyre/
154
+
155
+ # pytype static type analyzer
156
+ .pytype/
157
+
158
+ # Cython debug symbols
159
+ cython_debug/
160
+
161
+ # PyCharm
162
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
165
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Mykola Lavreniuk
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,103 @@
1
+ import os
2
+ import sys
3
+
4
+ depth_directory = os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth'))
5
+ sys.path.append(depth_directory)
6
+ os.chdir(depth_directory)
7
+
8
+ import cv2
9
+ import numpy as np
10
+ import torch
11
+ import torch.backends.cudnn as cudnn
12
+ from depth.models_depth.model import EVPDepth
13
+ from depth.configs.train_options import TrainOptions
14
+ from depth.configs.test_options import TestOptions
15
+ import glob
16
+ import utils
17
+ import torchvision.transforms as transforms
18
+ from utils_depth.misc import colorize
19
+ from PIL import Image
20
+ import torch.nn.functional as F
21
+ import gradio as gr
22
+ import tempfile
+ from collections import OrderedDict
23
+
24
+
25
+ css = """
26
+ #img-display-container {
27
+ max-height: 50vh;
28
+ }
29
+ #img-display-input {
30
+ max-height: 40vh;
31
+ }
32
+ #img-display-output {
33
+ max-height: 40vh;
34
+ }
35
+
36
+ """
37
+
38
+ def create_demo(model, device):
39
+ gr.Markdown("### Depth Prediction demo")
40
+ with gr.Row():
41
+ input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
42
+ depth_image = gr.Image(label="Depth Map", elem_id='img-display-output')
43
+ raw_file = gr.File(label="16-bit raw depth, multiplier:256")
44
+ submit = gr.Button("Submit")
45
+
46
+ def on_submit(image):
47
+ transform = transforms.ToTensor()
48
+ image = transform(image).unsqueeze(0).to(device)
49
+ shape = image.shape
50
+ image = torch.nn.functional.interpolate(image, (440,480), mode='bilinear', align_corners=True)
51
+ image = F.pad(image, (0, 0, 40, 0))
52
+ with torch.no_grad():
53
+ pred = model(image)['pred_d']
54
+
55
+ pred = pred[:,:,40:,:]
56
+ pred = torch.nn.functional.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
57
+ pred_d_numpy = pred.squeeze().cpu().numpy()
58
+ colored_depth, _, _ = colorize(pred_d_numpy, cmap='gray_r')
59
+
60
+ tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
61
+ raw_depth = Image.fromarray((pred_d_numpy*256).astype('uint16'))
62
+ raw_depth.save(tmp.name)
63
+ return [colored_depth, tmp.name]
64
+
65
+ submit.click(on_submit, inputs=[input_image], outputs=[depth_image, raw_file])
66
+ examples = gr.Examples(examples=["test_img.jpg"],
67
+ inputs=[input_image])
68
+
69
+
70
+ def main():
71
+ opt = TestOptions().initialize()
72
+ opt.add_argument('--img_path', type=str)
73
+ args = opt.parse_args()
74
+
75
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
76
+ model = EVPDepth(args=args, caption_aggregation=True)
77
+ cudnn.benchmark = True
78
+ model.to(device)
79
+ model_weight = torch.load(args.ckpt_dir)['model']
80
+ if 'module' in next(iter(model_weight.items()))[0]:
81
+ model_weight = OrderedDict((k[7:], v) for k, v in model_weight.items())
82
+ model.load_state_dict(model_weight, strict=False)
83
+ model.eval()
84
+
85
+ title = "# EVP"
86
+ description = """Official demo for **EVP: Enhanced Visual Perception using Inverse Multi-Attentive Feature
87
+ Refinement and Regularized Image-Text Alignment**.
88
+ EVP is a deep learning model for metric depth estimation from a single image.
89
+ Please refer to our [paper](https://arxiv.org/abs/2312.08548) or [github](https://github.com/Lavreniuk/EVP) for more details."""
90
+
91
+ with gr.Blocks() as demo:
92
+ gr.Markdown(title)
93
+ gr.Markdown(description)
94
+ with gr.Tab("Depth Prediction"):
95
+ create_demo(model, device)
96
+ gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:<a href="https://huggingface.co/spaces/shariqfarooq/ZoeDepth?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>
97
+ <p><img src="https://visitor-badge.glitch.me/badge?page_id=shariqfarooq.zoedepth_demo_hf" alt="visitors"></p></center>''')
98
+
99
+ demo.queue().launch(share=True)
100
+
101
+
102
+ if __name__ == '__main__':
103
+ main()
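The `on_submit` handler above resizes the input to 440x480, pads 40 rows onto the top of the image before the forward pass, then crops those rows off the prediction and resizes it back to the original resolution. A minimal sketch of that pre/post-processing on random tensors (no checkpoint or Gradio needed; the sizes, the pad and the `align_corners` setting are the ones used in `app.py`, and `F.pad` takes `(left, right, top, bottom)` for the last two dimensions):

```python
import torch
import torch.nn.functional as F

image = torch.rand(1, 3, 480, 640)        # stand-in for the ToTensor()-converted PIL input
shape = image.shape
x = F.interpolate(image, (440, 480), mode='bilinear', align_corners=True)
x = F.pad(x, (0, 0, 40, 0))               # pad 40 rows at the top -> (1, 3, 480, 480)

pred = torch.rand(1, 1, 480, 480)         # stand-in for model(x)['pred_d']
pred = pred[:, :, 40:, :]                 # drop the padded rows again
pred = F.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
print(pred.shape)                         # torch.Size([1, 1, 480, 640])
```

The raw depth file returned by the demo stores metres multiplied by 256 in a 16-bit PNG, so dividing the loaded array by 256 recovers metric depth.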
depth/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # Depth Estimation
2
+ ## Getting Started
3
+
4
+ 1. Install the [mmcv-full](https://github.com/open-mmlab/mmcv) library and some required packages.
5
+
6
+ ```bash
7
+ pip install openmim
8
+ mim install mmcv-full
9
+ pip install -r requirements.txt
10
+ ```
11
+
12
+ 2. Prepare NYUDepthV2 datasets following [GLPDepth](https://github.com/vinvino02/GLPDepth) and [BTS](https://github.com/cleinc/bts/tree/master).
13
+
14
+ ```
15
+ mkdir nyu_depth_v2
16
+ wget http://horatio.cs.nyu.edu/mit/silberman/nyu_depth_v2/nyu_depth_v2_labeled.mat
17
+ python extract_official_train_test_set_from_mat.py nyu_depth_v2_labeled.mat splits.mat ./nyu_depth_v2/official_splits/
18
+ ```
19
+
20
+ Download sync.zip provided by the authors of BTS from this [url](https://drive.google.com/file/d/1AysroWpfISmm-yRFGBgFTrLy6FjQwvwP/view) and unzip it in the `./nyu_depth_v2` folder.
21
+
22
+ Your dataset directory should be:
23
+
24
+ ```
25
+ │nyu_depth_v2/
26
+ ├──official_splits/
27
+ │ ├── test
28
+ │ ├── train
29
+ ├──sync/
30
+ ```
31
+
32
+ ## Results and Fine-tuned Models
33
+
34
+ EVP obtains 0.224 RMSE on the NYUv2 depth estimation benchmark, establishing a new state of the art.
35
+
36
+ | | RMSE | d1 | d2 | d3 | REL | log_10 |
37
+ |---------|-------|-------|--------|------|-------|-------|
38
+ | **EVP** | 0.224 | 0.976 | 0.997 | 0.999 | 0.061 | 0.027 |
39
+
40
+ EVP obtains 0.048 REL and 0.136 SqREL on the KITTI depth estimation benchmark, establishing a new state of the art.
41
+
42
+ | | REL | SqREL | RMSE | RMSE log | d1 | d2 | d3 |
43
+ |---------|-------|-------|--------|----------|-------|-------|-------|
44
+ | **EVP** | 0.048 | 0.136 | 2.015 | 0.073 | 0.980 | 0.998 | 1.000 |
45
+
46
+ ## Training
47
+
48
+ Run the following command to train the EVP-Depth model.
49
+
50
+ ```
51
+ bash train.sh <LOG_DIR>
52
+ ```
53
+
54
+ ## Evaluation
55
+ Command format:
56
+ ```
57
+ bash test.sh <CHECKPOINT_PATH>
58
+ ```
59
+
60
+ ## Custom inference
61
+ ```
62
+ PYTHONPATH="../":$PYTHONPATH python inference.py --img_path test_img.jpg --ckpt_dir nyu.ckpt
63
+ ```
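A small sanity check of the dataset layout described above, assuming the `nyu_depth_v2` directory sits next to the training scripts (adjust `root` if your data lives elsewhere):

```python
import os

root = 'nyu_depth_v2'
for sub in ('official_splits/train', 'official_splits/test', 'sync'):
    path = os.path.join(root, sub)
    print(path, 'ok' if os.path.isdir(path) else 'missing')
```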
depth/configs/base_options.py ADDED
@@ -0,0 +1,56 @@
1
+ # ------------------------------------------------------------------------------
2
+ # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth).
3
+ # For non-commercial purpose only (research, evaluation etc).
4
+ # Modified by Zigang Geng (zigang@mail.ustc.edu.cn).
5
+ # ------------------------------------------------------------------------------
6
+
7
+ import argparse
8
+
9
+
10
+ def str2bool(v):
11
+ if isinstance(v, bool):
12
+ return v
13
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
14
+ return True
15
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
16
+ return False
17
+ else:
18
+ raise argparse.ArgumentTypeError('Boolean value expected.')
19
+
20
+
21
+ class BaseOptions():
22
+ def __init__(self):
23
+ pass
24
+
25
+ def initialize(self):
26
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
27
+ # base configs
28
+ parser.add_argument('--resume_from', type=str, default='')
29
+ parser.add_argument('--exp_name', type=str, default='')
30
+ parser.add_argument('--gpu_or_cpu', type=str, default='gpu')
31
+ parser.add_argument('--data_path', type=str, default='/data/ssd1/')
32
+ parser.add_argument('--dataset', type=str, default='nyudepthv2',
33
+ choices=['nyudepthv2', 'kitti', 'imagepath'])
34
+ parser.add_argument('--batch_size', type=int, default=8)
35
+ parser.add_argument('--workers', type=int, default=8)
36
+
37
+ # depth configs
38
+ parser.add_argument('--max_depth', type=float, default=10.0)
39
+ parser.add_argument('--max_depth_eval', type=float, default=10.0)
40
+ parser.add_argument('--min_depth_eval', type=float, default=1e-3)
41
+ parser.add_argument('--do_kb_crop', type=int, default=1)
42
+ parser.add_argument('--kitti_crop', type=str, default=None,
43
+ choices=['garg_crop', 'eigen_crop'])
44
+
45
+ parser.add_argument('--pretrained', type=str, default='')
46
+ parser.add_argument('--drop_path_rate', type=float, default=0.3)
47
+ parser.add_argument('--use_checkpoint', type=str2bool, default='False')
48
+ parser.add_argument('--num_deconv', type=int, default=3)
49
+ parser.add_argument('--num_filters', nargs='+', type=int, default=[32,32,32])
50
+ parser.add_argument('--deconv_kernels', nargs='+', type=int, default=[2,2,2])
51
+
52
+ parser.add_argument('--shift_window_test', action='store_true')
53
+ parser.add_argument('--shift_size', type=int, default=2)
54
+ parser.add_argument('--flip_test', action='store_true')
55
+
56
+ return parser
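A short usage sketch of the parser defined above, assuming the module is importable as `configs.base_options` (the import path used by `test_options.py` below); it also shows how `str2bool` lets boolean options accept values such as `yes`/`no`:

```python
from configs.base_options import BaseOptions, str2bool

parser = BaseOptions().initialize()
args = parser.parse_args(['--dataset', 'kitti',
                          '--use_checkpoint', 'yes',
                          '--num_filters', '32', '32', '32'])
print(args.dataset, args.use_checkpoint, args.num_filters)  # kitti True [32, 32, 32]
print(str2bool('n'))                                        # False
```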
depth/configs/test_options.py ADDED
@@ -0,0 +1,27 @@
1
+ # ------------------------------------------------------------------------------
2
+ # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth).
3
+ # For non-commercial purpose only (research, evaluation etc).
4
+ # ------------------------------------------------------------------------------
5
+
6
+ from configs.base_options import BaseOptions
7
+
8
+ class TestOptions(BaseOptions):
9
+ def initialize(self):
10
+ parser = BaseOptions.initialize(self)
11
+
12
+ # experiment configs
13
+ parser.add_argument('--ckpt_dir', type=str,
14
+ default='./ckpt/best_model_nyu.ckpt',
15
+ help='load ckpt path')
16
+ parser.add_argument('--result_dir', type=str, default='./results',
17
+ help='save result images into result_dir/exp_name')
18
+ parser.add_argument('--crop_h', type=int, default=448)
19
+ parser.add_argument('--crop_w', type=int, default=576)
20
+
21
+ parser.add_argument('--save_eval_pngs', action='store_true',
22
+ help='save result image into evaluation form')
23
+ parser.add_argument('--save_visualize', action='store_true',
24
+ help='save result image into visualized form')
25
+ return parser
26
+
27
+
depth/configs/train_options.py ADDED
@@ -0,0 +1,50 @@
1
+ # ------------------------------------------------------------------------------
2
+ # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth).
3
+ # For non-commercial purpose only (research, evaluation etc).
4
+ # Modified by Zigang Geng (zigang@mail.ustc.edu.cn).
5
+ # ------------------------------------------------------------------------------
6
+
7
+ from configs.base_options import BaseOptions
8
+ import argparse
9
+
10
+
11
+ def str2bool(v):
12
+ if isinstance(v, bool):
13
+ return v
14
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
15
+ return True
16
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
17
+ return False
18
+ else:
19
+ raise argparse.ArgumentTypeError('Boolean value expected.')
20
+
21
+
22
+ class TrainOptions(BaseOptions):
23
+ def initialize(self):
24
+ parser = BaseOptions.initialize(self)
25
+
26
+ # experiment configs
27
+ parser.add_argument('--epochs', type=int, default=25)
28
+ parser.add_argument('--max_lr', type=float, default=5e-4)
29
+ parser.add_argument('--min_lr', type=float, default=3e-5)
30
+ parser.add_argument('--weight_decay', type=float, default=5e-2)
31
+ parser.add_argument('--layer_decay', type=float, default=0.9)
32
+
33
+ parser.add_argument('--crop_h', type=int, default=448)
34
+ parser.add_argument('--crop_w', type=int, default=576)
35
+ parser.add_argument('--log_dir', type=str, default='./logs')
36
+
37
+ # logging options
38
+ parser.add_argument('--val_freq', type=int, default=1)
39
+ parser.add_argument('--pro_bar', type=str2bool, default='False')
40
+ parser.add_argument('--save_freq', type=int, default=1)
41
+ parser.add_argument('--print_freq', type=int, default=100)
42
+ parser.add_argument('--save_model', action='store_true')
43
+ parser.add_argument(
44
+ '--resume-from', help='the checkpoint file to resume from')
45
+ parser.add_argument('--auto_resume', action='store_true')
46
+ parser.add_argument('--save_result', action='store_true')
47
+
48
+
49
+
50
+ return parser
depth/inference.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import torch
5
+ import torch.backends.cudnn as cudnn
6
+ from models_depth.model import EVPDepth
7
+ from configs.train_options import TrainOptions
8
+ from configs.test_options import TestOptions
9
+ import glob
10
+ import utils
11
+ import torchvision.transforms as transforms
12
+ from utils_depth.misc import colorize
13
+ from PIL import Image
14
+ import torch.nn.functional as F
+ from collections import OrderedDict
15
+
16
+
17
+ def main():
18
+ opt = TestOptions().initialize()
19
+ opt.add_argument('--img_path', type=str)
20
+ args = opt.parse_args()
21
+
22
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
+ model = EVPDepth(args=args, caption_aggregation=True)
24
+ cudnn.benchmark = True
25
+ model.to(device)
26
+ model_weight = torch.load(args.ckpt_dir)['model']
27
+ if 'module' in next(iter(model_weight.items()))[0]:
28
+ model_weight = OrderedDict((k[7:], v) for k, v in model_weight.items())
29
+ model.load_state_dict(model_weight, strict=False)
30
+ model.eval()
31
+
32
+ img_path = args.img_path
33
+ image = cv2.imread(img_path)
34
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
35
+ transform = transforms.ToTensor()
36
+ image = transform(image).unsqueeze(0).to(device)
37
+ shape = image.shape
38
+ image = torch.nn.functional.interpolate(image, (440,480), mode='bilinear', align_corners=True)
39
+ image = F.pad(image, (0, 0, 40, 0))
40
+
41
+ with torch.no_grad():
42
+ pred = model(image)['pred_d']
43
+
44
+ pred = pred[:,:,40:,:]
45
+ pred = torch.nn.functional.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
46
+ pred_d_numpy = pred.squeeze().cpu().numpy()
47
+ pred_d_color, _, _ = colorize(pred_d_numpy, cmap='gray_r')
48
+ Image.fromarray(pred_d_color).save('res.png')
49
+
50
+ return 0
51
+
52
+ if __name__ == '__main__':
53
+ main()
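Both `inference.py` and `app.py` strip a leading `module.` prefix from checkpoint keys before `load_state_dict` (weights saved from a `DataParallel`/`DistributedDataParallel` wrapper carry that prefix). A toy illustration of that step with a fake state dict, so no checkpoint file is needed:

```python
from collections import OrderedDict
import torch

model_weight = OrderedDict([('module.encoder.weight', torch.zeros(1)),
                            ('module.encoder.bias', torch.zeros(1))])
if 'module' in next(iter(model_weight.items()))[0]:
    # k[7:] removes the 7-character 'module.' prefix
    model_weight = OrderedDict((k[7:], v) for k, v in model_weight.items())
print(list(model_weight))  # ['encoder.weight', 'encoder.bias']
```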
depth/models_depth/attractor.py ADDED
@@ -0,0 +1,208 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+
28
+
29
+ @torch.jit.script
30
+ def exp_attractor(dx, alpha: float = 300, gamma: int = 2):
31
+ """Exponential attractor: dc = exp(-alpha*|dx|^gamma) * dx, where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center
32
+
33
+ Args:
34
+ dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
35
+ alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300.
36
+ gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2.
37
+
38
+ Returns:
39
+ torch.Tensor : Delta shifts - dc; New bin centers = Old bin centers + dc
40
+ """
41
+ return torch.exp(-alpha*(torch.abs(dx)**gamma)) * (dx)
42
+
43
+
44
+ @torch.jit.script
45
+ def inv_attractor(dx, alpha: float = 300, gamma: int = 2):
46
+ """Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center
47
+ This is the default one according to the accompanying paper.
48
+
49
+ Args:
50
+ dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
51
+ alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300.
52
+ gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2.
53
+
54
+ Returns:
55
+ torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc
56
+ """
57
+ return dx.div(1+alpha*dx.pow(gamma))
58
+
59
+
60
+ class AttractorLayer(nn.Module):
61
+ def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10,
62
+ alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False):
63
+ """
64
+ Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth)
65
+ """
66
+ super().__init__()
67
+
68
+ self.n_attractors = n_attractors
69
+ self.n_bins = n_bins
70
+ self.min_depth = min_depth
71
+ self.max_depth = max_depth
72
+ self.alpha = alpha
73
+ self.gamma = gamma
74
+ self.kind = kind
75
+ self.attractor_type = attractor_type
76
+ self.memory_efficient = memory_efficient
77
+
78
+ self._net = nn.Sequential(
79
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
80
+ nn.ReLU(inplace=True),
81
+ nn.Conv2d(mlp_dim, n_attractors*2, 1, 1, 0), # x2 for linear norm
82
+ nn.ReLU(inplace=True)
83
+ )
84
+
85
+ def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
86
+ """
87
+ Args:
88
+ x (torch.Tensor) : feature block; shape - n, c, h, w
89
+ b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w
90
+
91
+ Returns:
92
+ tuple(torch.Tensor,torch.Tensor) : new bin centers normed and scaled; shape - n, nbins, h, w
93
+ """
94
+ if prev_b_embedding is not None:
95
+ if interpolate:
96
+ prev_b_embedding = nn.functional.interpolate(
97
+ prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
98
+ x = x + prev_b_embedding
99
+
100
+ A = self._net(x)
101
+ eps = 1e-3
102
+ A = A + eps
103
+ n, c, h, w = A.shape
104
+ A = A.view(n, self.n_attractors, 2, h, w)
105
+ A_normed = A / A.sum(dim=2, keepdim=True) # n, a, 2, h, w
106
+ A_normed = A[:, :, 0, ...] # n, na, h, w
107
+
108
+ b_prev = nn.functional.interpolate(
109
+ b_prev, (h, w), mode='bilinear', align_corners=True)
110
+ b_centers = b_prev
111
+
112
+ if self.attractor_type == 'exp':
113
+ dist = exp_attractor
114
+ else:
115
+ dist = inv_attractor
116
+
117
+ if not self.memory_efficient:
118
+ func = {'mean': torch.mean, 'sum': torch.sum}[self.kind]
119
+ # .shape N, nbins, h, w
120
+ delta_c = func(dist(A_normed.unsqueeze(
121
+ 2) - b_centers.unsqueeze(1)), dim=1)
122
+ else:
123
+ delta_c = torch.zeros_like(b_centers, device=b_centers.device)
124
+ for i in range(self.n_attractors):
125
+ # .shape N, nbins, h, w
126
+ delta_c += dist(A_normed[:, i, ...].unsqueeze(1) - b_centers)
127
+
128
+ if self.kind == 'mean':
129
+ delta_c = delta_c / self.n_attractors
130
+
131
+ b_new_centers = b_centers + delta_c
132
+ B_centers = (self.max_depth - self.min_depth) * \
133
+ b_new_centers + self.min_depth
134
+ B_centers, _ = torch.sort(B_centers, dim=1)
135
+ B_centers = torch.clip(B_centers, self.min_depth, self.max_depth)
136
+ return b_new_centers, B_centers
137
+
138
+
139
+ class AttractorLayerUnnormed(nn.Module):
140
+ def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10,
141
+ alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False):
142
+ """
143
+ Attractor layer for bin centers. Bin centers are unbounded
144
+ """
145
+ super().__init__()
146
+
147
+ self.n_attractors = n_attractors
148
+ self.n_bins = n_bins
149
+ self.min_depth = min_depth
150
+ self.max_depth = max_depth
151
+ self.alpha = alpha
152
+ self.gamma = gamma
153
+ self.kind = kind
154
+ self.attractor_type = attractor_type
155
+ self.memory_efficient = memory_efficient
156
+
157
+ self._net = nn.Sequential(
158
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
159
+ nn.ReLU(inplace=True),
160
+ nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0),
161
+ nn.Softplus()
162
+ )
163
+
164
+ def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
165
+ """
166
+ Args:
167
+ x (torch.Tensor) : feature block; shape - n, c, h, w
168
+ b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w
169
+
170
+ Returns:
171
+ tuple(torch.Tensor,torch.Tensor) : new bin centers unbounded; shape - n, nbins, h, w. Two outputs just to keep the API consistent with the normed version
172
+ """
173
+ if prev_b_embedding is not None:
174
+ if interpolate:
175
+ prev_b_embedding = nn.functional.interpolate(
176
+ prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
177
+ x = x + prev_b_embedding
178
+
179
+ A = self._net(x)
180
+ n, c, h, w = A.shape
181
+
182
+ b_prev = nn.functional.interpolate(
183
+ b_prev, (h, w), mode='bilinear', align_corners=True)
184
+ b_centers = b_prev
185
+
186
+ if self.attractor_type == 'exp':
187
+ dist = exp_attractor
188
+ else:
189
+ dist = inv_attractor
190
+
191
+ if not self.memory_efficient:
192
+ func = {'mean': torch.mean, 'sum': torch.sum}[self.kind]
193
+ # .shape N, nbins, h, w
194
+ delta_c = func(
195
+ dist(A.unsqueeze(2) - b_centers.unsqueeze(1)), dim=1)
196
+ else:
197
+ delta_c = torch.zeros_like(b_centers, device=b_centers.device)
198
+ for i in range(self.n_attractors):
199
+ delta_c += dist(A[:, i, ...].unsqueeze(1) -
200
+ b_centers) # .shape N, nbins, h, w
201
+
202
+ if self.kind == 'mean':
203
+ delta_c = delta_c / self.n_attractors
204
+
205
+ b_new_centers = b_centers + delta_c
206
+ B_centers = b_new_centers
207
+
208
+ return b_new_centers, B_centers
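A quick worked check of the two attractor functions above with their default `alpha=300`, `gamma=2`: for `dx = 0.1` the inverse attractor gives `dc = 0.1 / (1 + 300 * 0.1**2) = 0.1 / 4 = 0.025`. The import path assumes the `depth/` directory is the working directory, as in `inference.py`:

```python
import torch
from models_depth.attractor import exp_attractor, inv_attractor

dx = torch.tensor([0.1, -0.05])   # attractor point minus bin center
print(inv_attractor(dx))          # tensor([ 0.0250, -0.0286]) approximately
print(exp_attractor(dx))          # exp(-300*dx**2) * dx, a much sharper falloff with |dx|
```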
depth/models_depth/checkpoint.py ADDED
@@ -0,0 +1,608 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft
3
+ # Licensed under the MIT License.
4
+ # The code is from Swin Transformer.
5
+ # (https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmcv_custom/checkpoint.py)
6
+ # ------------------------------------------------------------------------------
7
+
8
+ import io
9
+ import os
10
+ import os.path as osp
11
+ import pkgutil
12
+ import time
13
+ import warnings
14
+ import numpy as np
15
+ from collections import OrderedDict
16
+ from importlib import import_module
17
+ from tempfile import TemporaryDirectory
18
+ from scipy import interpolate
19
+
20
+ import torch
21
+ import torchvision
22
+ import torch.distributed as dist
23
+ from torch.optim import Optimizer
24
+ from torch.utils import model_zoo
25
+ from torch.nn import functional as F
26
+
27
+ import mmcv
28
+ from mmcv.fileio import FileClient
29
+ from mmcv.fileio import load as load_file
30
+ from mmcv.parallel import is_module_wrapper
31
+ from mmcv.utils import mkdir_or_exist
32
+ from mmcv.runner import get_dist_info
33
+ from mmcv.utils import get_logger
34
+
35
+ import logging
+
+ # Constants used by _get_mmcv_home below; these values follow mmcv's own defaults.
+ ENV_MMCV_HOME = 'MMCV_HOME'
+ ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
+ DEFAULT_CACHE_DIR = '~/.cache'
36
+
37
+
38
+ def get_root_logger(log_file=None, log_level=logging.INFO):
39
+ """Get the root logger.
40
+
41
+ The logger will be initialized if it has not been initialized. By default a
42
+ StreamHandler will be added. If `log_file` is specified, a FileHandler will
43
+ also be added. The name of the root logger is the top-level package name,
44
+ e.g., "mmseg".
45
+
46
+ Args:
47
+ log_file (str | None): The log filename. If specified, a FileHandler
48
+ will be added to the root logger.
49
+ log_level (int): The root logger level. Note that only the process of
50
+ rank 0 is affected, while other processes will set the level to
51
+ "Error" and be silent most of the time.
52
+
53
+ Returns:
54
+ logging.Logger: The root logger.
55
+ """
56
+
57
+ logger = get_logger(name='mmpose', log_file=log_file, log_level=log_level)
58
+
59
+ return logger
60
+
61
+
62
+ def _get_mmcv_home():
63
+ mmcv_home = os.path.expanduser(
64
+ os.getenv(
65
+ ENV_MMCV_HOME,
66
+ os.path.join(
67
+ os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv')))
68
+
69
+ mkdir_or_exist(mmcv_home)
70
+ return mmcv_home
71
+
72
+
73
+ def load_state_dict(module, state_dict, strict=False, logger=None):
74
+ """Load state_dict to a module.
75
+
76
+ This method is modified from :meth:`torch.nn.Module.load_state_dict`.
77
+ Default value for ``strict`` is set to ``False`` and the message for
78
+ param mismatch will be shown even if strict is False.
79
+
80
+ Args:
81
+ module (Module): Module that receives the state_dict.
82
+ state_dict (OrderedDict): Weights.
83
+ strict (bool): whether to strictly enforce that the keys
84
+ in :attr:`state_dict` match the keys returned by this module's
85
+ :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
86
+ logger (:obj:`logging.Logger`, optional): Logger to log the error
87
+ message. If not specified, print function will be used.
88
+ """
89
+ unexpected_keys = []
90
+ all_missing_keys = []
91
+ err_msg = []
92
+
93
+ metadata = getattr(state_dict, '_metadata', None)
94
+ state_dict = state_dict.copy()
95
+ if metadata is not None:
96
+ state_dict._metadata = metadata
97
+
98
+ # use _load_from_state_dict to enable checkpoint version control
99
+ def load(module, prefix=''):
100
+ # recursively check parallel module in case that the model has a
101
+ # complicated structure, e.g., nn.Module(nn.Module(DDP))
102
+ if is_module_wrapper(module):
103
+ module = module.module
104
+ local_metadata = {} if metadata is None else metadata.get(
105
+ prefix[:-1], {})
106
+ module._load_from_state_dict(state_dict, prefix, local_metadata, True,
107
+ all_missing_keys, unexpected_keys,
108
+ err_msg)
109
+ for name, child in module._modules.items():
110
+ if child is not None:
111
+ load(child, prefix + name + '.')
112
+
113
+ load(module)
114
+ load = None # break load->load reference cycle
115
+
116
+ # ignore "num_batches_tracked" of BN layers
117
+ missing_keys = [
118
+ key for key in all_missing_keys if 'num_batches_tracked' not in key
119
+ ]
120
+
121
+ if unexpected_keys:
122
+ err_msg.append('unexpected key in source '
123
+ f'state_dict: {", ".join(unexpected_keys)}\n')
124
+ if missing_keys:
125
+ err_msg.append(
126
+ f'missing keys in source state_dict: {", ".join(missing_keys)}\n')
127
+
128
+ rank, _ = get_dist_info()
129
+ if len(err_msg) > 0 and rank == 0:
130
+ err_msg.insert(
131
+ 0, 'The model and loaded state dict do not match exactly\n')
132
+ err_msg = '\n'.join(err_msg)
133
+ if strict:
134
+ raise RuntimeError(err_msg)
135
+ elif logger is not None:
136
+ logger.warning(err_msg)
137
+ else:
138
+ print(err_msg)
139
+
140
+
141
+ def load_url_dist(url, model_dir=None):
142
+ """In distributed setting, this function only downloads the checkpoint at local
143
+ rank 0."""
144
+ rank, world_size = get_dist_info()
145
+ rank = int(os.environ.get('LOCAL_RANK', rank))
146
+ if rank == 0:
147
+ checkpoint = model_zoo.load_url(url, model_dir=model_dir)
148
+ if world_size > 1:
149
+ torch.distributed.barrier()
150
+ if rank > 0:
151
+ checkpoint = model_zoo.load_url(url, model_dir=model_dir)
152
+ return checkpoint
153
+
154
+
155
+ def load_pavimodel_dist(model_path, map_location=None):
156
+ """In distributed setting, this function only downloads the checkpoint at local
157
+ rank 0."""
158
+ try:
159
+ from pavi import modelcloud
160
+ except ImportError:
161
+ raise ImportError(
162
+ 'Please install pavi to load checkpoint from modelcloud.')
163
+ rank, world_size = get_dist_info()
164
+ rank = int(os.environ.get('LOCAL_RANK', rank))
165
+ if rank == 0:
166
+ model = modelcloud.get(model_path)
167
+ with TemporaryDirectory() as tmp_dir:
168
+ downloaded_file = osp.join(tmp_dir, model.name)
169
+ model.download(downloaded_file)
170
+ checkpoint = torch.load(downloaded_file, map_location=map_location)
171
+ if world_size > 1:
172
+ torch.distributed.barrier()
173
+ if rank > 0:
174
+ model = modelcloud.get(model_path)
175
+ with TemporaryDirectory() as tmp_dir:
176
+ downloaded_file = osp.join(tmp_dir, model.name)
177
+ model.download(downloaded_file)
178
+ checkpoint = torch.load(
179
+ downloaded_file, map_location=map_location)
180
+ return checkpoint
181
+
182
+
183
+ def load_fileclient_dist(filename, backend, map_location):
184
+ """In distributed setting, this function only downloads the checkpoint at local
185
+ rank 0."""
186
+ rank, world_size = get_dist_info()
187
+ rank = int(os.environ.get('LOCAL_RANK', rank))
188
+ allowed_backends = ['ceph']
189
+ if backend not in allowed_backends:
190
+ raise ValueError(f'Load from Backend {backend} is not supported.')
191
+ if rank == 0:
192
+ fileclient = FileClient(backend=backend)
193
+ buffer = io.BytesIO(fileclient.get(filename))
194
+ checkpoint = torch.load(buffer, map_location=map_location)
195
+ if world_size > 1:
196
+ torch.distributed.barrier()
197
+ if rank > 0:
198
+ fileclient = FileClient(backend=backend)
199
+ buffer = io.BytesIO(fileclient.get(filename))
200
+ checkpoint = torch.load(buffer, map_location=map_location)
201
+ return checkpoint
202
+
203
+
204
+ def get_torchvision_models():
205
+ model_urls = dict()
206
+ for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
207
+ if ispkg:
208
+ continue
209
+ _zoo = import_module(f'torchvision.models.{name}')
210
+ if hasattr(_zoo, 'model_urls'):
211
+ _urls = getattr(_zoo, 'model_urls')
212
+ model_urls.update(_urls)
213
+ return model_urls
214
+
215
+
216
+ def get_external_models():
217
+ mmcv_home = _get_mmcv_home()
218
+ default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
219
+ default_urls = load_file(default_json_path)
220
+ assert isinstance(default_urls, dict)
221
+ external_json_path = osp.join(mmcv_home, 'open_mmlab.json')
222
+ if osp.exists(external_json_path):
223
+ external_urls = load_file(external_json_path)
224
+ assert isinstance(external_urls, dict)
225
+ default_urls.update(external_urls)
226
+
227
+ return default_urls
228
+
229
+
230
+ def get_mmcls_models():
231
+ mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
232
+ mmcls_urls = load_file(mmcls_json_path)
233
+
234
+ return mmcls_urls
235
+
236
+
237
+ def get_deprecated_model_names():
238
+ deprecate_json_path = osp.join(mmcv.__path__[0],
239
+ 'model_zoo/deprecated.json')
240
+ deprecate_urls = load_file(deprecate_json_path)
241
+ assert isinstance(deprecate_urls, dict)
242
+
243
+ return deprecate_urls
244
+
245
+
246
+ def _process_mmcls_checkpoint(checkpoint):
247
+ state_dict = checkpoint['state_dict']
248
+ new_state_dict = OrderedDict()
249
+ for k, v in state_dict.items():
250
+ if k.startswith('backbone.'):
251
+ new_state_dict[k[9:]] = v
252
+ new_checkpoint = dict(state_dict=new_state_dict)
253
+
254
+ return new_checkpoint
255
+
256
+
257
+ def _load_checkpoint(filename, map_location=None):
258
+ """Load checkpoint from somewhere (modelzoo, file, url).
259
+
260
+ Args:
261
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
262
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
263
+ details.
264
+ map_location (str | None): Same as :func:`torch.load`. Default: None.
265
+
266
+ Returns:
267
+ dict | OrderedDict: The loaded checkpoint. It can be either an
268
+ OrderedDict storing model weights or a dict containing other
269
+ information, which depends on the checkpoint.
270
+ """
271
+ if filename.startswith('modelzoo://'):
272
+ warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
273
+ 'use "torchvision://" instead')
274
+ model_urls = get_torchvision_models()
275
+ model_name = filename[11:]
276
+ checkpoint = load_url_dist(model_urls[model_name])
277
+ elif filename.startswith('torchvision://'):
278
+ model_urls = get_torchvision_models()
279
+ model_name = filename[14:]
280
+ checkpoint = load_url_dist(model_urls[model_name])
281
+ elif filename.startswith('open-mmlab://'):
282
+ model_urls = get_external_models()
283
+ model_name = filename[13:]
284
+ deprecated_urls = get_deprecated_model_names()
285
+ if model_name in deprecated_urls:
286
+ warnings.warn(f'open-mmlab://{model_name} is deprecated in favor '
287
+ f'of open-mmlab://{deprecated_urls[model_name]}')
288
+ model_name = deprecated_urls[model_name]
289
+ model_url = model_urls[model_name]
290
+ # check if is url
291
+ if model_url.startswith(('http://', 'https://')):
292
+ checkpoint = load_url_dist(model_url)
293
+ else:
294
+ filename = osp.join(_get_mmcv_home(), model_url)
295
+ if not osp.isfile(filename):
296
+ raise IOError(f'{filename} is not a checkpoint file')
297
+ checkpoint = torch.load(filename, map_location=map_location)
298
+ elif filename.startswith('mmcls://'):
299
+ model_urls = get_mmcls_models()
300
+ model_name = filename[8:]
301
+ checkpoint = load_url_dist(model_urls[model_name])
302
+ checkpoint = _process_mmcls_checkpoint(checkpoint)
303
+ elif filename.startswith(('http://', 'https://')):
304
+ checkpoint = load_url_dist(filename)
305
+ elif filename.startswith('pavi://'):
306
+ model_path = filename[7:]
307
+ checkpoint = load_pavimodel_dist(model_path, map_location=map_location)
308
+ elif filename.startswith('s3://'):
309
+ checkpoint = load_fileclient_dist(
310
+ filename, backend='ceph', map_location=map_location)
311
+ else:
312
+ if not osp.isfile(filename):
313
+ raise IOError(f'{filename} is not a checkpoint file')
314
+ checkpoint = torch.load(filename, map_location=map_location)
315
+ return checkpoint
316
+
317
+
318
+ def load_checkpoint_swin(model,
319
+ filename,
320
+ map_location='cpu',
321
+ strict=False,
322
+ rpe_interpolation='outer_mask',
323
+ logger=None):
324
+ """Load checkpoint from a file or URI.
325
+
326
+ Args:
327
+ model (Module): Module to load checkpoint.
328
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
329
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
330
+ details.
331
+ map_location (str): Same as :func:`torch.load`.
332
+ strict (bool): Whether to allow different params for the model and
333
+ checkpoint.
334
+ logger (:mod:`logging.Logger` or None): The logger for error message.
335
+
336
+ Returns:
337
+ dict or OrderedDict: The loaded checkpoint.
338
+ """
339
+ checkpoint = _load_checkpoint(filename, map_location)
340
+ # OrderedDict is a subclass of dict
341
+ if not isinstance(checkpoint, dict):
342
+ raise RuntimeError(
343
+ f'No state_dict found in checkpoint file {filename}')
344
+ # get state_dict from checkpoint
345
+ if 'state_dict' in checkpoint:
346
+ state_dict = checkpoint['state_dict']
347
+ elif 'model' in checkpoint:
348
+ state_dict = checkpoint['model']
349
+ elif 'module' in checkpoint:
350
+ state_dict = checkpoint['module']
351
+ else:
352
+ state_dict = checkpoint
353
+ # strip prefix of state_dict
354
+ if list(state_dict.keys())[0].startswith('module.'):
355
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
356
+
357
+ # for MoBY, load model of online branch
358
+ if sorted(list(state_dict.keys()))[2].startswith('encoder'):
359
+ state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')}
360
+
361
+ # reshape absolute position embedding for Swin
362
+ if state_dict.get('absolute_pos_embed') is not None:
363
+ absolute_pos_embed = state_dict['absolute_pos_embed']
364
+ N1, L, C1 = absolute_pos_embed.size()
365
+ N2, C2, H, W = model.absolute_pos_embed.size()
366
+ if N1 != N2 or C1 != C2 or L != H * W:
367
+ logger.warning("Error in loading absolute_pos_embed, pass")
368
+ else:
369
+ state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2)
370
+
371
+ # interpolate position bias table if needed
372
+ relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k]
373
+ for k in relative_position_bias_table_keys:
374
+ table_pretrained = state_dict[k]
375
+ table_current = model.state_dict()[k]
376
+ L1, nH1 = table_pretrained.size()
377
+ L2, nH2 = table_current.size()
378
+ if nH1 != nH2:
379
+ logger.warning(f"Error in loading {k}, pass")
380
+ else:
381
+ if L1 != L2:
382
+ if rpe_interpolation in ['bicubic', 'bilinear', 'nearest']:
383
+ logger.info(f"Interpolate relative_position_bias_table using {rpe_interpolation}")
384
+ S1 = int(L1 ** 0.5)
385
+ S2 = int(L2 ** 0.5)
386
+ table_pretrained_resized = F.interpolate(
387
+ table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
388
+ size=(S2, S2), mode=rpe_interpolation)
389
+ state_dict[k] = table_pretrained_resized.view(nH2, L2).permute(1, 0)
390
+ elif rpe_interpolation == 'geo':
391
+ logger.info("Interpolate relative_position_bias_table using geo.")
392
+ src_size = int(L1 ** 0.5)
393
+ dst_size = int(L2 ** 0.5)
394
+
395
+ def geometric_progression(a, r, n):
396
+ return a * (1.0 - r ** n) / (1.0 - r)
397
+
398
+ left, right = 1.01, 1.5
399
+ while right - left > 1e-6:
400
+ q = (left + right) / 2.0
401
+ gp = geometric_progression(1, q, src_size // 2)
402
+ if gp > dst_size // 2:
403
+ right = q
404
+ else:
405
+ left = q
406
+
407
+ # if q > 1.13492:
408
+ # q = 1.13492
409
+
410
+ dis = []
411
+ cur = 1
412
+ for i in range(src_size // 2):
413
+ dis.append(cur)
414
+ cur += q ** (i + 1)
415
+
416
+ r_ids = [-_ for _ in reversed(dis)]
417
+
418
+ x = r_ids + [0] + dis
419
+ y = r_ids + [0] + dis
420
+
421
+ t = dst_size // 2.0
422
+ dx = np.arange(-t, t + 0.1, 1.0)
423
+ dy = np.arange(-t, t + 0.1, 1.0)
424
+
425
+ logger.info("Original positions = %s" % str(x))
426
+ logger.info("Target positions = %s" % str(dx))
427
+
428
+ all_rel_pos_bias = []
429
+
430
+ for i in range(nH1):
431
+ z = table_pretrained[:, i].view(src_size, src_size).float().numpy()
432
+ f_cubic = interpolate.interp2d(x, y, z, kind='cubic')
433
+ all_rel_pos_bias.append(torch.Tensor(f_cubic(dx, dy)).contiguous().view(-1, 1).to(
434
+ table_pretrained.device))
435
+
436
+ new_rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
437
+ state_dict[k] = new_rel_pos_bias
438
+
439
+ if 'pos_embed' in state_dict:
440
+ pos_embed_checkpoint = state_dict['pos_embed']
441
+ embedding_size = pos_embed_checkpoint.shape[-1]
442
+ num_patches = model.patch_embed.num_patches
443
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
444
+ # height (== width) for the checkpoint position embedding
445
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
446
+ # height (== width) for the new position embedding
447
+ new_size = int(num_patches ** 0.5)
448
+ # class_token and dist_token are kept unchanged
449
+ if orig_size != new_size:
450
+ if dist.get_rank() == 0:
451
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
452
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
453
+ # only the position tokens are interpolated
454
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
455
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
456
+ pos_tokens = torch.nn.functional.interpolate(
457
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
458
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
459
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
460
+ state_dict['pos_embed'] = new_pos_embed
461
+
462
+ # load state_dict
463
+ load_state_dict(model, state_dict, strict, logger)
464
+ return checkpoint
465
+
466
+
467
+ def weights_to_cpu(state_dict):
468
+ """Copy a model state_dict to cpu.
469
+
470
+ Args:
471
+ state_dict (OrderedDict): Model weights on GPU.
472
+
473
+ Returns:
474
+ OrderedDict: Model weights on CPU.
475
+ """
476
+ state_dict_cpu = OrderedDict()
477
+ for key, val in state_dict.items():
478
+ state_dict_cpu[key] = val.cpu()
479
+ return state_dict_cpu
480
+
481
+
482
+ def _save_to_state_dict(module, destination, prefix, keep_vars):
483
+ """Saves module state to `destination` dictionary.
484
+
485
+ This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.
486
+
487
+ Args:
488
+ module (nn.Module): The module to generate state_dict.
489
+ destination (dict): A dict where state will be stored.
490
+ prefix (str): The prefix for parameters and buffers used in this
491
+ module.
492
+ """
493
+ for name, param in module._parameters.items():
494
+ if param is not None:
495
+ destination[prefix + name] = param if keep_vars else param.detach()
496
+ for name, buf in module._buffers.items():
497
+ # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
498
+ if buf is not None:
499
+ destination[prefix + name] = buf if keep_vars else buf.detach()
500
+
501
+
502
+ def get_state_dict(module, destination=None, prefix='', keep_vars=False):
503
+ """Returns a dictionary containing a whole state of the module.
504
+
505
+ Both parameters and persistent buffers (e.g. running averages) are
506
+ included. Keys are corresponding parameter and buffer names.
507
+
508
+ This method is modified from :meth:`torch.nn.Module.state_dict` to
509
+ recursively check parallel module in case that the model has a complicated
510
+ structure, e.g., nn.Module(nn.Module(DDP)).
511
+
512
+ Args:
513
+ module (nn.Module): The module to generate state_dict.
514
+ destination (OrderedDict): Returned dict for the state of the
515
+ module.
516
+ prefix (str): Prefix of the key.
517
+ keep_vars (bool): Whether to keep the variable property of the
518
+ parameters. Default: False.
519
+
520
+ Returns:
521
+ dict: A dictionary containing a whole state of the module.
522
+ """
523
+ # recursively check parallel module in case that the model has a
524
+ # complicated structure, e.g., nn.Module(nn.Module(DDP))
525
+ if is_module_wrapper(module):
526
+ module = module.module
527
+
528
+ # below is the same as torch.nn.Module.state_dict()
529
+ if destination is None:
530
+ destination = OrderedDict()
531
+ destination._metadata = OrderedDict()
532
+ destination._metadata[prefix[:-1]] = local_metadata = dict(
533
+ version=module._version)
534
+ _save_to_state_dict(module, destination, prefix, keep_vars)
535
+ for name, child in module._modules.items():
536
+ if child is not None:
537
+ get_state_dict(
538
+ child, destination, prefix + name + '.', keep_vars=keep_vars)
539
+ for hook in module._state_dict_hooks.values():
540
+ hook_result = hook(module, destination, prefix, local_metadata)
541
+ if hook_result is not None:
542
+ destination = hook_result
543
+ return destination
544
+
545
+
546
+ def save_checkpoint(model, filename, optimizer=None, meta=None):
547
+ """Save checkpoint to file.
548
+
549
+ The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
550
+ ``optimizer``. By default ``meta`` will contain version and time info.
551
+
552
+ Args:
553
+ model (Module): Module whose params are to be saved.
554
+ filename (str): Checkpoint filename.
555
+ optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
556
+ meta (dict, optional): Metadata to be saved in checkpoint.
557
+ """
558
+ if meta is None:
559
+ meta = {}
560
+ elif not isinstance(meta, dict):
561
+ raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
562
+ meta.update(mmcv_version=mmcv.__version__, time=time.asctime())
563
+
564
+ if is_module_wrapper(model):
565
+ model = model.module
566
+
567
+ if hasattr(model, 'CLASSES') and model.CLASSES is not None:
568
+ # save class name to the meta
569
+ meta.update(CLASSES=model.CLASSES)
570
+
571
+ checkpoint = {
572
+ 'meta': meta,
573
+ 'state_dict': weights_to_cpu(get_state_dict(model))
574
+ }
575
+ # save optimizer state dict in the checkpoint
576
+ if isinstance(optimizer, Optimizer):
577
+ checkpoint['optimizer'] = optimizer.state_dict()
578
+ elif isinstance(optimizer, dict):
579
+ checkpoint['optimizer'] = {}
580
+ for name, optim in optimizer.items():
581
+ checkpoint['optimizer'][name] = optim.state_dict()
582
+
583
+ if filename.startswith('pavi://'):
584
+ try:
585
+ from pavi import modelcloud
586
+ from pavi.exception import NodeNotFoundError
587
+ except ImportError:
588
+ raise ImportError(
589
+ 'Please install pavi to load checkpoint from modelcloud.')
590
+ model_path = filename[7:]
591
+ root = modelcloud.Folder()
592
+ model_dir, model_name = osp.split(model_path)
593
+ try:
594
+ model = modelcloud.get(model_dir)
595
+ except NodeNotFoundError:
596
+ model = root.create_training_model(model_dir)
597
+ with TemporaryDirectory() as tmp_dir:
598
+ checkpoint_file = osp.join(tmp_dir, model_name)
599
+ with open(checkpoint_file, 'wb') as f:
600
+ torch.save(checkpoint, f)
601
+ f.flush()
602
+ model.create_file(checkpoint_file, name=model_name)
603
+ else:
604
+ mmcv.mkdir_or_exist(osp.dirname(filename))
605
+ # immediately flush buffer
606
+ with open(filename, 'wb') as f:
607
+ torch.save(checkpoint, f)
608
+ f.flush()
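These helpers mirror mmcv's checkpoint utilities with Swin-specific handling of position embeddings. A minimal save/load round trip as a sketch, assuming `mmcv-full` is installed (see the depth README) and the module is importable as `models_depth.checkpoint` from the `depth/` directory; the tiny `nn.Sequential` and the `ckpt_demo/` path are stand-ins for illustration only:

```python
import torch.nn as nn
from models_depth.checkpoint import save_checkpoint, load_checkpoint_swin, get_root_logger

# small stand-in model (the real code passes the Swin/ViT backbone here)
model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8))
save_checkpoint(model, 'ckpt_demo/tiny.pth', meta={'note': 'demo'})  # hypothetical output path

ckpt = load_checkpoint_swin(model, 'ckpt_demo/tiny.pth',
                            map_location='cpu', logger=get_root_logger())
print(sorted(ckpt.keys()))   # ['meta', 'state_dict']
```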
depth/models_depth/dist_layers.py ADDED
@@ -0,0 +1,121 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+
28
+
29
+ def log_binom(n, k, eps=1e-7):
30
+ """ log(nCk) using Stirling's approximation """
31
+ n = n + eps
32
+ k = k + eps
33
+ return n * torch.log(n) - k * torch.log(k) - (n-k) * torch.log(n-k+eps)
34
+
35
+
36
+ class LogBinomial(nn.Module):
37
+ def __init__(self, n_classes=256, act=torch.softmax):
38
+ """Compute log binomial distribution for n_classes
39
+
40
+ Args:
41
+ n_classes (int, optional): number of output classes. Defaults to 256.
42
+ """
43
+ super().__init__()
44
+ self.K = n_classes
45
+ self.act = act
46
+ self.register_buffer('k_idx', torch.arange(
47
+ 0, n_classes).view(1, -1, 1, 1))
48
+ self.register_buffer('K_minus_1', torch.Tensor(
49
+ [self.K-1]).view(1, -1, 1, 1))
50
+
51
+ def forward(self, x, t=1., eps=1e-4):
52
+ """Compute log binomial distribution for x
53
+
54
+ Args:
55
+ x (torch.Tensor - NCHW): probabilities
56
+ t (float, torch.Tensor - NCHW, optional): Temperature of distribution. Defaults to 1..
57
+ eps (float, optional): Small number for numerical stability. Defaults to 1e-4.
58
+
59
+ Returns:
60
+ torch.Tensor -NCHW: log binomial distribution logbinomial(p;t)
61
+ """
62
+ if x.ndim == 3:
63
+ x = x.unsqueeze(1) # make it nchw
64
+
65
+ one_minus_x = torch.clamp(1 - x, eps, 1)
66
+ x = torch.clamp(x, eps, 1)
67
+ y = log_binom(self.K_minus_1, self.k_idx) + self.k_idx * \
68
+ torch.log(x) + (self.K - 1 - self.k_idx) * torch.log(one_minus_x)
69
+ return self.act(y/t, dim=1)
70
+
71
+
72
+ class ConditionalLogBinomial(nn.Module):
73
+ def __init__(self, in_features, condition_dim, n_classes=256, bottleneck_factor=2, p_eps=1e-4, max_temp=50, min_temp=1e-7, act=torch.softmax):
74
+ """Conditional Log Binomial distribution
75
+
76
+ Args:
77
+ in_features (int): number of input channels in main feature
78
+ condition_dim (int): number of input channels in condition feature
79
+ n_classes (int, optional): Number of classes. Defaults to 256.
80
+ bottleneck_factor (int, optional): Hidden dim factor. Defaults to 2.
81
+ p_eps (float, optional): small eps value. Defaults to 1e-4.
82
+ max_temp (float, optional): Maximum temperature of output distribution. Defaults to 50.
83
+ min_temp (float, optional): Minimum temperature of output distribution. Defaults to 1e-7.
84
+ """
85
+ super().__init__()
86
+ self.p_eps = p_eps
87
+ self.max_temp = max_temp
88
+ self.min_temp = min_temp
89
+ self.log_binomial_transform = LogBinomial(n_classes, act=act)
90
+ bottleneck = (in_features + condition_dim) // bottleneck_factor
91
+ self.mlp = nn.Sequential(
92
+ nn.Conv2d(in_features + condition_dim, bottleneck,
93
+ kernel_size=1, stride=1, padding=0),
94
+ nn.GELU(),
95
+ # 2 for p linear norm, 2 for t linear norm
96
+ nn.Conv2d(bottleneck, 2+2, kernel_size=1, stride=1, padding=0),
97
+ nn.Softplus()
98
+ )
99
+
100
+ def forward(self, x, cond):
101
+ """Forward pass
102
+
103
+ Args:
104
+ x (torch.Tensor - NCHW): Main feature
105
+ cond (torch.Tensor - NCHW): condition feature
106
+
107
+ Returns:
108
+ torch.Tensor: Output log binomial distribution
109
+ """
110
+ pt = self.mlp(torch.concat((x, cond), dim=1))
111
+ p, t = pt[:, :2, ...], pt[:, 2:, ...]
112
+
113
+ p = p + self.p_eps
114
+ p = p[:, 0, ...] / (p[:, 0, ...] + p[:, 1, ...])
115
+
116
+ t = t + self.p_eps
117
+ t = t[:, 0, ...] / (t[:, 0, ...] + t[:, 1, ...])
118
+ t = t.unsqueeze(1)
119
+ t = (self.max_temp - self.min_temp) * t + self.min_temp
120
+
121
+ return self.log_binomial_transform(p, t)
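For quick reference, a minimal shape-check sketch of the ConditionalLogBinomial head defined above; the import path and all tensor sizes are assumptions for illustration, not values taken from the training code:

    import torch
    from models_depth.dist_layers import ConditionalLogBinomial  # hypothetical import path

    head = ConditionalLogBinomial(in_features=33, condition_dim=128, n_classes=64)
    x = torch.rand(2, 33, 24, 32)      # main feature (e.g. decoder output + relative depth channel)
    cond = torch.rand(2, 128, 24, 32)  # bin-embedding condition
    probs = head(x, cond)              # per-pixel distribution over the 64 depth bins
    print(probs.shape)                 # torch.Size([2, 64, 24, 32]); sums to 1 along dim=1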
depth/models_depth/layers.py ADDED
@@ -0,0 +1,36 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class PatchTransformerEncoder(nn.Module):
6
+ def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4):
7
+ super(PatchTransformerEncoder, self).__init__()
8
+ encoder_layers = nn.TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward=1024)
9
+ self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=4) # takes shape S,N,E
10
+
11
+ self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim,
12
+ kernel_size=patch_size, stride=patch_size, padding=0)
13
+
14
+ self.positional_encodings = nn.Parameter(torch.rand(900, embedding_dim), requires_grad=True)
15
+
16
+ def forward(self, x):
17
+ embeddings = self.embedding_convPxP(x).flatten(2) # .shape = n,c,s = n, embedding_dim, s
18
+ # embeddings = nn.functional.pad(embeddings, (1,0)) # extra special token at start ?
19
+ embeddings = embeddings + self.positional_encodings[:embeddings.shape[2], :].T.unsqueeze(0)
20
+
21
+ # change to S,N,E format required by transformer
22
+ embeddings = embeddings.permute(2, 0, 1)
23
+ x = self.transformer_encoder(embeddings) # .shape = S, N, E
24
+ return x
25
+
26
+
27
+ class PixelWiseDotProduct(nn.Module):
28
+ def __init__(self):
29
+ super(PixelWiseDotProduct, self).__init__()
30
+
31
+ def forward(self, x, K):
32
+ n, c, h, w = x.size()
33
+ _, cout, ck = K.size()
34
+ assert c == ck, "Number of channels in x and Embedding dimension (at dim 2) of K matrix must match"
35
+ y = torch.matmul(x.view(n, c, h * w).permute(0, 2, 1), K.permute(0, 2, 1)) # .shape = n, hw, cout
36
+ return y.permute(0, 2, 1).view(n, cout, h, w)
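A hedged shape sketch for the two building blocks above; the import path and sizes are illustrative only:

    import torch
    from models_depth.layers import PatchTransformerEncoder, PixelWiseDotProduct  # hypothetical path

    enc = PatchTransformerEncoder(in_channels=64, patch_size=10, embedding_dim=128, num_heads=4)
    dot = PixelWiseDotProduct()

    x = torch.rand(2, 64, 80, 80)              # 80/10 = 8 patches per side -> S = 64 tokens
    tokens = enc(x)                            # (S, N, E) = (64, 2, 128)
    queries = tokens[1:17].permute(1, 0, 2)    # take 16 query tokens -> (N, 16, E)
    feat = torch.rand(2, 128, 80, 80)          # any feature map with E channels
    attn_maps = dot(feat, queries)             # (2, 16, 80, 80)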
depth/models_depth/localbins_layers.py ADDED
@@ -0,0 +1,169 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+
28
+
29
+ class SeedBinRegressor(nn.Module):
30
+ def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10):
31
+ """Bin center regressor network. Bin centers are bounded on (min_depth, max_depth) interval.
32
+
33
+ Args:
34
+ in_features (int): input channels
35
+ n_bins (int, optional): Number of bin centers. Defaults to 16.
36
+ mlp_dim (int, optional): Hidden dimension. Defaults to 256.
37
+ min_depth (float, optional): Min depth value. Defaults to 1e-3.
38
+ max_depth (float, optional): Max depth value. Defaults to 10.
39
+ """
40
+ super().__init__()
41
+ self.version = "1_1"
42
+ self.min_depth = min_depth
43
+ self.max_depth = max_depth
44
+
45
+ self._net = nn.Sequential(
46
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
47
+ nn.ReLU(inplace=True),
48
+ nn.Conv2d(mlp_dim, n_bins, 1, 1, 0),
49
+ nn.ReLU(inplace=True)
50
+ )
51
+
52
+ def forward(self, x):
53
+ """
54
+ Returns normalized bin widths and bin centers; one vector per pixel.
55
+ """
56
+ B = self._net(x)
57
+ eps = 1e-3
58
+ B = B + eps
59
+ B_widths_normed = B / B.sum(dim=1, keepdim=True)
60
+ B_widths = (self.max_depth - self.min_depth) * \
61
+ B_widths_normed # .shape NCHW
62
+ # pad has the form (left, right, top, bottom, front, back)
63
+ B_widths = nn.functional.pad(
64
+ B_widths, (0, 0, 0, 0, 1, 0), mode='constant', value=self.min_depth)
65
+ B_edges = torch.cumsum(B_widths, dim=1) # .shape NCHW
66
+
67
+ B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:, 1:, ...])
68
+ return B_widths_normed, B_centers
69
+
70
+
71
+ class SeedBinRegressorUnnormed(nn.Module):
72
+ def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10):
73
+ """Bin center regressor network. Bin centers are unbounded
74
+
75
+ Args:
76
+ in_features (int): input channels
77
+ n_bins (int, optional): Number of bin centers. Defaults to 16.
78
+ mlp_dim (int, optional): Hidden dimension. Defaults to 256.
79
+ min_depth (float, optional): Not used. (for compatibility with SeedBinRegressor)
80
+ max_depth (float, optional): Not used. (for compatibility with SeedBinRegressor)
81
+ """
82
+ super().__init__()
83
+ self.version = "1_1"
84
+ self._net = nn.Sequential(
85
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
86
+ nn.ReLU(inplace=True),
87
+ nn.Conv2d(mlp_dim, n_bins, 1, 1, 0),
88
+ nn.Softplus()
89
+ )
90
+
91
+ def forward(self, x):
92
+ """
93
+ Returns the (unnormalized) bin centers twice, for API compatibility with SeedBinRegressor.
94
+ """
95
+ B_centers = self._net(x)
96
+ return B_centers, B_centers
97
+
98
+
99
+ class Projector(nn.Module):
100
+ def __init__(self, in_features, out_features, mlp_dim=128):
101
+ """Projector MLP
102
+
103
+ Args:
104
+ in_features (int): input channels
105
+ out_features (int): output channels
106
+ mlp_dim (int, optional): hidden dimension. Defaults to 128.
107
+ """
108
+ super().__init__()
109
+
110
+ self._net = nn.Sequential(
111
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
112
+ nn.ReLU(inplace=True),
113
+ nn.Conv2d(mlp_dim, out_features, 1, 1, 0),
114
+ )
115
+
116
+ def forward(self, x):
117
+ return self._net(x)
118
+
119
+
120
+
121
+ class LinearSplitter(nn.Module):
122
+ def __init__(self, in_features, prev_nbins, split_factor=2, mlp_dim=128, min_depth=1e-3, max_depth=10):
123
+ super().__init__()
124
+
125
+ self.prev_nbins = prev_nbins
126
+ self.split_factor = split_factor
127
+ self.min_depth = min_depth
128
+ self.max_depth = max_depth
129
+
130
+ self._net = nn.Sequential(
131
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
132
+ nn.GELU(),
133
+ nn.Conv2d(mlp_dim, prev_nbins * split_factor, 1, 1, 0),
134
+ nn.ReLU()
135
+ )
136
+
137
+ def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
138
+ """
139
+ x : feature block; shape - n, c, h, w
140
+ b_prev : previous bin widths normed; shape - n, prev_nbins, h, w
141
+ """
142
+ if prev_b_embedding is not None:
143
+ if interpolate:
144
+ prev_b_embedding = nn.functional.interpolate(prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
145
+ x = x + prev_b_embedding
146
+ S = self._net(x)
147
+ eps = 1e-3
148
+ S = S + eps
149
+ n, c, h, w = S.shape
150
+ S = S.view(n, self.prev_nbins, self.split_factor, h, w)
151
+ S_normed = S / S.sum(dim=2, keepdim=True) # fractional splits
152
+
153
+ b_prev = nn.functional.interpolate(b_prev, (h,w), mode='bilinear', align_corners=True)
154
+
155
+
156
+ b_prev = b_prev / b_prev.sum(dim=1, keepdim=True) # renormalize to guarantee the widths sum to 1
157
+ # print(b_prev.shape, S_normed.shape)
158
+ # if is_for_query:(1).expand(-1, b_prev.size(0)//n, -1, -1, -1, -1).flatten(0,1) # TODO ? can replace all this with a single torch.repeat?
159
+ b = b_prev.unsqueeze(2) * S_normed
160
+ b = b.flatten(1,2) # .shape n, prev_nbins * split_factor, h, w
161
+
162
+ # calculate bin centers for loss calculation
163
+ B_widths = (self.max_depth - self.min_depth) * b # .shape N, nprev * splitfactor, H, W
164
+ # pad has the form (left, right, top, bottom, front, back)
165
+ B_widths = nn.functional.pad(B_widths, (0,0,0,0,1,0), mode='constant', value=self.min_depth)
166
+ B_edges = torch.cumsum(B_widths, dim=1) # .shape NCHW
167
+
168
+ B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:,1:,...])
169
+ return b, B_centers
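A minimal usage sketch for the seed-bin blocks above; the import path and shapes are assumptions:

    import torch
    from models_depth.localbins_layers import SeedBinRegressor, Projector  # hypothetical path

    regressor = SeedBinRegressor(in_features=256, n_bins=16, min_depth=1e-3, max_depth=10)
    projector = Projector(in_features=256, out_features=128)

    feat = torch.rand(2, 256, 24, 32)
    widths_normed, centers = regressor(feat)   # both (2, 16, 24, 32); centers lie in (min_depth, max_depth)
    embedding = projector(feat)                # (2, 128, 24, 32) bin embedding for the next stage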
depth/models_depth/miniViT.py ADDED
@@ -0,0 +1,45 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .layers import PatchTransformerEncoder, PixelWiseDotProduct
5
+
6
+
7
+ class mViT(nn.Module):
8
+ def __init__(self, in_channels, n_query_channels=128, patch_size=16, dim_out=256,
9
+ embedding_dim=128, num_heads=4, norm='linear'):
10
+ super(mViT, self).__init__()
11
+ self.norm = norm
12
+ self.n_query_channels = n_query_channels
13
+ self.patch_transformer = PatchTransformerEncoder(in_channels, patch_size, embedding_dim, num_heads)
14
+ self.dot_product_layer = PixelWiseDotProduct()
15
+
16
+ self.conv3x3 = nn.Conv2d(in_channels, embedding_dim, kernel_size=3, stride=1, padding=1)
17
+ self.regressor = nn.Sequential(nn.Linear(embedding_dim, 256),
18
+ nn.LeakyReLU(),
19
+ nn.Linear(256, 256),
20
+ nn.LeakyReLU(),
21
+ nn.Linear(256, dim_out))
22
+
23
+ def forward(self, x):
24
+ # n, c, h, w = x.size()
25
+ tgt = self.patch_transformer(x.clone()) # .shape = S, N, E
26
+
27
+ x = self.conv3x3(x)
28
+
29
+ regression_head, queries = tgt[0, ...], tgt[1:self.n_query_channels + 1, ...]
30
+
31
+ # Change from S, N, E to N, S, E
32
+ queries = queries.permute(1, 0, 2)
33
+ range_attention_maps = self.dot_product_layer(x, queries) # .shape = n, n_query_channels, h, w
34
+
35
+ y = self.regressor(regression_head) # .shape = N, dim_out
36
+ if self.norm == 'linear':
37
+ y = torch.relu(y)
38
+ eps = 0.1
39
+ y = y + eps
40
+ elif self.norm == 'softmax':
41
+ return torch.softmax(y, dim=1), range_attention_maps
42
+ else:
43
+ y = torch.sigmoid(y)
44
+ y = y / y.sum(dim=1, keepdim=True)
45
+ return y, range_attention_maps
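A small, hedged sketch of driving the mViT head above; the input size is chosen so that the number of patches exceeds n_query_channels + 1, everything else is illustrative:

    import torch
    from models_depth.miniViT import mViT  # hypothetical import path

    head = mViT(in_channels=64, n_query_channels=128, patch_size=16,
                dim_out=256, embedding_dim=128, norm='linear')
    x = torch.rand(2, 64, 192, 192)                  # 12x12 = 144 patches >= n_query_channels + 1
    bin_widths_normed, range_attention_maps = head(x)
    print(bin_widths_normed.shape)                   # (2, 256), each row sums to 1
    print(range_attention_maps.shape)                # (2, 128, 192, 192)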
depth/models_depth/model.py ADDED
@@ -0,0 +1,666 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft
3
+ # Licensed under the MIT License.
4
+ # The deconvolution code is based on Simple Baseline.
5
+ # (https://github.com/microsoft/human-pose-estimation.pytorch/blob/master/lib/models/pose_resnet.py)
6
+ # Modified by Zigang Geng (zigang@mail.ustc.edu.cn).
7
+ # ------------------------------------------------------------------------------
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from timm.models.layers import trunc_normal_, DropPath
12
+ from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
13
+ constant_init, normal_init)
14
+ from omegaconf import OmegaConf
15
+ from ldm.util import instantiate_from_config
16
+ import torch.nn.functional as F
17
+
18
+ from evp.models import UNetWrapper, TextAdapterRefer, FrozenCLIPEmbedder
19
+ from .miniViT import mViT
20
+ from .attractor import AttractorLayer, AttractorLayerUnnormed
21
+ from .dist_layers import ConditionalLogBinomial
22
+ from .localbins_layers import (Projector, SeedBinRegressor, SeedBinRegressorUnnormed)
23
+ import os
24
+
25
+
26
+ def icnr(x, scale=2, init=nn.init.kaiming_normal_):
27
+ """
28
+ Checkerboard artifact free sub-pixel convolution
29
+ https://arxiv.org/abs/1707.02937
30
+ """
31
+ ni,nf,h,w = x.shape
32
+ ni2 = int(ni/(scale**2))
33
+ k = init(torch.zeros([ni2,nf,h,w])).transpose(0, 1)
34
+ k = k.contiguous().view(ni2, nf, -1)
35
+ k = k.repeat(1, 1, scale**2)
36
+ k = k.contiguous().view([nf,ni,h,w]).transpose(0, 1)
37
+ x.data.copy_(k)
38
+
39
+
40
+ class PixelShuffle(nn.Module):
41
+ """
42
+ Real-Time Single Image and Video Super-Resolution
43
+ https://arxiv.org/abs/1609.05158
44
+ """
45
+ def __init__(self, n_channels, scale):
46
+ super(PixelShuffle, self).__init__()
47
+ self.conv = nn.Conv2d(n_channels, n_channels*(scale**2), kernel_size=1)
48
+ icnr(self.conv.weight)
49
+ self.shuf = nn.PixelShuffle(scale)
50
+ self.relu = nn.ReLU()
51
+
52
+ def forward(self,x):
53
+ x = self.shuf(self.relu(self.conv(x)))
54
+ return x
55
+
56
+
57
+ class AttentionModule(nn.Module):
58
+ def __init__(self, in_channels, out_channels):
59
+ super(AttentionModule, self).__init__()
60
+
61
+ # Convolutional Layers
62
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
63
+
64
+ # Group Normalization
65
+ self.group_norm = nn.GroupNorm(20, out_channels)
66
+
67
+ # ReLU Activation
68
+ self.relu = nn.ReLU()
69
+
70
+ # Spatial Attention
71
+ self.spatial_attention = nn.Sequential(
72
+ nn.Conv2d(in_channels, 1, kernel_size=1),
73
+ nn.Sigmoid()
74
+ )
75
+
76
+ def forward(self, x):
77
+ # Apply spatial attention
78
+ spatial_attention = self.spatial_attention(x)
79
+ x = x * spatial_attention
80
+
81
+ # Apply convolutional layer
82
+ x = self.conv1(x)
83
+ x = self.group_norm(x)
84
+ x = self.relu(x)
85
+
86
+ return x
87
+
88
+
89
+ class AttentionDownsamplingModule(nn.Module):
90
+ def __init__(self, in_channels, out_channels, scale_factor=2):
91
+ super(AttentionDownsamplingModule, self).__init__()
92
+
93
+ # Spatial Attention
94
+ self.spatial_attention = nn.Sequential(
95
+ nn.Conv2d(in_channels, 1, kernel_size=1),
96
+ nn.Sigmoid()
97
+ )
98
+
99
+ # Channel Attention
100
+ self.channel_attention = nn.Sequential(
101
+ nn.AdaptiveAvgPool2d(1),
102
+ nn.Conv2d(in_channels, in_channels // 8, kernel_size=1),
103
+ nn.ReLU(inplace=True),
104
+ nn.Conv2d(in_channels // 8, in_channels, kernel_size=1),
105
+ nn.Sigmoid()
106
+ )
107
+
108
+ # Convolutional Layers
109
+ if scale_factor == 2:
110
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
111
+ elif scale_factor == 4:
112
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
113
+
114
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
115
+
116
+ # Group Normalization
117
+ self.group_norm = nn.GroupNorm(20, out_channels)
118
+
119
+ # ReLU Activation
120
+ self.relu = nn.ReLU(inplace=True)
121
+
122
+ def forward(self, x):
123
+ # Apply spatial attention
124
+ spatial_attention = self.spatial_attention(x)
125
+ x = x * spatial_attention
126
+
127
+ # Apply channel attention
128
+ channel_attention = self.channel_attention(x)
129
+ x = x * channel_attention
130
+
131
+ # Apply convolutional layers
132
+ x = self.conv1(x)
133
+ x = self.group_norm(x)
134
+ x = self.relu(x)
135
+ x = self.conv2(x)
136
+ x = self.group_norm(x)
137
+ x = self.relu(x)
138
+
139
+ return x
140
+
141
+
142
+ class AttentionUpsamplingModule(nn.Module):
143
+ def __init__(self, in_channels, out_channels):
144
+ super(AttentionUpsamplingModule, self).__init__()
145
+
146
+ # Spatial Attention for outs[2]
147
+ self.spatial_attention = nn.Sequential(
148
+ nn.Conv2d(in_channels, 1, kernel_size=1),
149
+ nn.Sigmoid()
150
+ )
151
+
152
+ # Channel Attention for outs[2]
153
+ self.channel_attention = nn.Sequential(
154
+ nn.AdaptiveAvgPool2d(1),
155
+ nn.Conv2d(in_channels, in_channels // 8, kernel_size=1),
156
+ nn.ReLU(),
157
+ nn.Conv2d(in_channels // 8, in_channels, kernel_size=1),
158
+ nn.Sigmoid()
159
+ )
160
+
161
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
162
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
163
+
164
+ # Group Normalization
165
+ self.group_norm = nn.GroupNorm(20, out_channels)
166
+
167
+ # ReLU Activation
168
+ self.relu = nn.ReLU()
169
+ self.upscale = PixelShuffle(in_channels, 2)
170
+
171
+ def forward(self, x):
172
+ # Apply spatial attention
173
+ spatial_attention = self.spatial_attention(x)
174
+ x = x * spatial_attention
175
+
176
+ # Apply channel attention
177
+ channel_attention = self.channel_attention(x)
178
+ x = x * channel_attention
179
+
180
+ # Apply convolutional layers
181
+ x = self.conv1(x)
182
+ x = self.group_norm(x)
183
+ x = self.relu(x)
184
+ x = self.conv2(x)
185
+ x = self.group_norm(x)
186
+ x = self.relu(x)
187
+
188
+ # Upsample
189
+ x = self.upscale(x)
190
+
191
+ return x
192
+
193
+
194
+ class ConvLayer(nn.Module):
195
+ def __init__(self, in_channels, out_channels):
196
+ super(ConvLayer, self).__init__()
197
+
198
+ self.conv1 = nn.Sequential(
199
+ nn.Conv2d(in_channels, out_channels, 1),
200
+ nn.GroupNorm(20, out_channels),
201
+ nn.ReLU(),
202
+ )
203
+
204
+ def forward(self, x):
205
+ x = self.conv1(x)
206
+
207
+ return x
208
+
209
+
210
+ class InverseMultiAttentiveFeatureRefinement(nn.Module):
211
+ def __init__(self, in_channels_list):
212
+ super(InverseMultiAttentiveFeatureRefinement, self).__init__()
213
+
214
+ self.layer1 = AttentionModule(in_channels_list[0], in_channels_list[0])
215
+ self.layer2 = AttentionDownsamplingModule(in_channels_list[0], in_channels_list[0]//2, scale_factor = 2)
216
+ self.layer3 = ConvLayer(in_channels_list[0]//2 + in_channels_list[1], in_channels_list[1])
217
+ self.layer4 = AttentionDownsamplingModule(in_channels_list[1], in_channels_list[1]//2, scale_factor = 2)
218
+ self.layer5 = ConvLayer(in_channels_list[1]//2 + in_channels_list[2], in_channels_list[2])
219
+ self.layer6 = AttentionDownsamplingModule(in_channels_list[2], in_channels_list[2]//2, scale_factor = 2)
220
+ self.layer7 = ConvLayer(in_channels_list[2]//2 + in_channels_list[3], in_channels_list[3])
221
+
222
+ '''
223
+ self.layer8 = AttentionUpsamplingModule(in_channels_list[3], in_channels_list[3])
224
+ self.layer9 = ConvLayer(in_channels_list[2] + in_channels_list[3], in_channels_list[2])
225
+ self.layer10 = AttentionUpsamplingModule(in_channels_list[2], in_channels_list[2])
226
+ self.layer11 = ConvLayer(in_channels_list[1] + in_channels_list[2], in_channels_list[1])
227
+ self.layer12 = AttentionUpsamplingModule(in_channels_list[1], in_channels_list[1])
228
+ self.layer13 = ConvLayer(in_channels_list[0] + in_channels_list[1], in_channels_list[0])
229
+ '''
230
+ def forward(self, inputs):
231
+ x_c4, x_c3, x_c2, x_c1 = inputs
232
+ x_c4 = self.layer1(x_c4)
233
+ x_c4_3 = self.layer2(x_c4)
234
+ x_c3 = torch.cat([x_c4_3, x_c3], dim=1)
235
+ x_c3 = self.layer3(x_c3)
236
+ x_c3_2 = self.layer4(x_c3)
237
+ x_c2 = torch.cat([x_c3_2, x_c2], dim=1)
238
+ x_c2 = self.layer5(x_c2)
239
+ x_c2_1 = self.layer6(x_c2)
240
+ x_c1 = torch.cat([x_c2_1, x_c1], dim=1)
241
+ x_c1 = self.layer7(x_c1)
242
+ '''
243
+ x_c1_2 = self.layer8(x_c1)
244
+ x_c2 = torch.cat([x_c1_2, x_c2], dim=1)
245
+ x_c2 = self.layer9(x_c2)
246
+ x_c2_3 = self.layer10(x_c2)
247
+ x_c3 = torch.cat([x_c2_3, x_c3], dim=1)
248
+ x_c3 = self.layer11(x_c3)
249
+ x_c3_4 = self.layer12(x_c3)
250
+ x_c4 = torch.cat([x_c3_4, x_c4], dim=1)
251
+ x_c4 = self.layer13(x_c4)
252
+ '''
253
+ return [x_c4, x_c3, x_c2, x_c1]
254
+
255
+
256
+ class EVPDepthEncoder(nn.Module):
257
+ def __init__(self, out_dim=1024, ldm_prior=[320, 680, 1320+1280], sd_path=None, text_dim=768,
258
+ dataset='nyu', caption_aggregation=False
259
+ ):
260
+ super().__init__()
261
+
262
+
263
+ self.layer1 = nn.Sequential(
264
+ nn.Conv2d(ldm_prior[0], ldm_prior[0], 3, stride=2, padding=1),
265
+ nn.GroupNorm(16, ldm_prior[0]),
266
+ nn.ReLU(),
267
+ nn.Conv2d(ldm_prior[0], ldm_prior[0], 3, stride=2, padding=1),
268
+ )
269
+
270
+ self.layer2 = nn.Sequential(
271
+ nn.Conv2d(ldm_prior[1], ldm_prior[1], 3, stride=2, padding=1),
272
+ )
273
+
274
+ self.out_layer = nn.Sequential(
275
+ nn.Conv2d(sum(ldm_prior), out_dim, 1),
276
+ nn.GroupNorm(16, out_dim),
277
+ nn.ReLU(),
278
+ )
279
+
280
+ self.aggregation = InverseMultiAttentiveFeatureRefinement([320, 680, 1320, 1280])
281
+
282
+ self.apply(self._init_weights)
283
+
284
+ ### stable diffusion layers
285
+
286
+ config = OmegaConf.load('./v1-inference.yaml')
287
+ if sd_path is None:
288
+ if os.path.exists('../checkpoints/v1-5-pruned-emaonly.ckpt'):
289
+ config.model.params.ckpt_path = '../checkpoints/v1-5-pruned-emaonly.ckpt'
290
+ else:
291
+ config.model.params.ckpt_path = None
292
+ else:
293
+ config.model.params.ckpt_path = f'../{sd_path}'
294
+
295
+ sd_model = instantiate_from_config(config.model)
296
+ self.encoder_vq = sd_model.first_stage_model
297
+
298
+ self.unet = UNetWrapper(sd_model.model, use_attn=True)
299
+ if dataset == 'kitti':
300
+ self.unet = UNetWrapper(sd_model.model, use_attn=True, base_size=384)
301
+
302
+ del sd_model.cond_stage_model
303
+ del self.encoder_vq.decoder
304
+ del self.unet.unet.diffusion_model.out
305
+ del self.encoder_vq.post_quant_conv.weight
306
+ del self.encoder_vq.post_quant_conv.bias
307
+
308
+ for param in self.encoder_vq.parameters():
309
+ param.requires_grad = True
310
+
311
+ self.text_adapter = TextAdapterRefer(text_dim=text_dim)
312
+ self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4)
313
+
314
+ if caption_aggregation:
315
+ class_embeddings = torch.load(f'{dataset}_class_embeddings_my_captions.pth')
316
+ #class_embeddings_list = [value['class_embeddings'] for key, value in class_embeddings.items()]
317
+ #stacked_embeddings = torch.stack(class_embeddings_list, dim=0)
318
+ #class_embeddings = torch.mean(stacked_embeddings, dim=0).unsqueeze(0)
319
+
320
+ if 'aggregated' in class_embeddings:
321
+ class_embeddings = class_embeddings['aggregated']
322
+ else:
323
+ clip_model = FrozenCLIPEmbedder(max_length=40,pool=False).cuda()
324
+ class_embeddings_new = [clip_model.encode(value['caption'][0]) for key, value in class_embeddings.items()]
325
+ class_embeddings_new = torch.mean(torch.stack(class_embeddings_new, dim=0), dim=0)
326
+ class_embeddings['aggregated'] = class_embeddings_new
327
+ torch.save(class_embeddings, f'{dataset}_class_embeddings_my_captions.pth')
328
+ class_embeddings = class_embeddings['aggregated']
329
+ self.register_buffer('class_embeddings', class_embeddings)
330
+ else:
331
+ self.class_embeddings = torch.load(f'{dataset}_class_embeddings_my_captions.pth')
332
+
333
+ self.clip_model = FrozenCLIPEmbedder(max_length=40,pool=False)
334
+ for param in self.clip_model.parameters():
335
+ param.requires_grad = True
336
+
337
+ #if dataset == 'kitti':
338
+ # self.text_adapter_ = TextAdapterRefer(text_dim=text_dim)
339
+ # self.gamma_ = nn.Parameter(torch.ones(text_dim) * 1e-4)
340
+
341
+ self.caption_aggregation = caption_aggregation
342
+ self.dataset = dataset
343
+
344
+ def _init_weights(self, m):
345
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
346
+ trunc_normal_(m.weight, std=.02)
347
+ nn.init.constant_(m.bias, 0)
348
+
349
+ def forward_features(self, feats):
350
+ x = self.ldm_to_net[0](feats[0])
351
+ for i in range(3):
352
+ if i > 0:
353
+ x = x + self.ldm_to_net[i](feats[i])
354
+ x = self.layers[i](x)
355
+ x = self.upsample_layers[i](x)
356
+ return self.out_conv(x)
357
+
358
+ def forward(self, x, class_ids=None, img_paths=None):
359
+ latents = self.encoder_vq.encode(x).mode()
360
+
361
+ # add division by std
362
+ if self.dataset == 'nyu':
363
+ latents = latents / 5.07543
364
+ elif self.dataset == 'kitti':
365
+ latents = latents / 4.6211
366
+ else:
367
+ print('Please calculate the STD for the dataset!')
368
+
369
+ if class_ids is not None:
370
+ if self.caption_aggregation:
371
+ class_embeddings = self.class_embeddings[[0]*len(class_ids.tolist())]#[class_ids.tolist()]
372
+ else:
373
+ class_embeddings = []
374
+
375
+ for img_path in img_paths:
376
+ class_embeddings.extend([value['caption'][0] for key, value in self.class_embeddings.items() if key in img_path.replace('//', '/')])
377
+
378
+ class_embeddings = self.clip_model.encode(class_embeddings)
379
+ else:
380
+ class_embeddings = self.class_embeddings
381
+
382
+ c_crossattn = self.text_adapter(latents, class_embeddings, self.gamma)
383
+ t = torch.ones((x.shape[0],), device=x.device).long()
384
+
385
+ #if self.dataset == 'kitti':
386
+ # c_crossattn_last = self.text_adapter_(latents, class_embeddings, self.gamma_)
387
+ # outs = self.unet(latents, t, c_crossattn=[c_crossattn, c_crossattn_last])
388
+ #else:
389
+ outs = self.unet(latents, t, c_crossattn=[c_crossattn])
390
+ outs = self.aggregation(outs)
391
+
392
+ feats = [outs[0], outs[1], torch.cat([outs[2], F.interpolate(outs[3], scale_factor=2)], dim=1)]
393
+ x = torch.cat([self.layer1(feats[0]), self.layer2(feats[1]), feats[2]], dim=1)
394
+ return self.out_layer(x)
395
+
396
+ def get_latent(self, x):
397
+ return self.encoder_vq.encode(x).mode()
398
+
399
+
400
+ class EVPDepth(nn.Module):
401
+ def __init__(self, args=None, caption_aggregation=False):
402
+ super().__init__()
403
+ self.max_depth = args.max_depth
404
+ self.min_depth = args.min_depth_eval
405
+
406
+ embed_dim = 192
407
+
408
+ channels_in = embed_dim*8
409
+ channels_out = embed_dim
410
+
411
+ if args.dataset == 'nyudepthv2':
412
+ self.encoder = EVPDepthEncoder(out_dim=channels_in, dataset='nyu', caption_aggregation=caption_aggregation)
413
+ else:
414
+ self.encoder = EVPDepthEncoder(out_dim=channels_in, dataset='kitti', caption_aggregation=caption_aggregation)
415
+
416
+ self.decoder = Decoder(channels_in, channels_out, args)
417
+ self.decoder.init_weights()
418
+ self.mViT = False
419
+ self.custom = False
420
+
421
+
422
+ if not self.mViT and not self.custom:
423
+ n_bins = 64
424
+ bin_embedding_dim = 128
425
+ num_out_features = [32, 32, 32, 192]
426
+ min_temp = 0.0212
427
+ max_temp = 50
428
+ btlnck_features = 256
429
+ n_attractors = [16, 8, 4, 1]
430
+ attractor_alpha = 1000
431
+ attractor_gamma = 2
432
+ attractor_kind = "mean"
433
+ attractor_type = "inv"
434
+ self.bin_centers_type = "softplus"
435
+
436
+ self.bottle_neck = nn.Sequential(
437
+ nn.Conv2d(channels_in, btlnck_features, kernel_size=3, stride=1, padding=1),
438
+ nn.ReLU(inplace=False),
439
+ nn.Conv2d(btlnck_features, btlnck_features, kernel_size=3, stride=1, padding=1))
440
+
441
+
442
+ for m in self.bottle_neck.modules():
443
+ if isinstance(m, nn.Conv2d):
444
+ normal_init(m, std=0.001, bias=0)
445
+
446
+
447
+ SeedBinRegressorLayer = SeedBinRegressorUnnormed
448
+ Attractor = AttractorLayerUnnormed
449
+ self.seed_bin_regressor = SeedBinRegressorLayer(
450
+ btlnck_features, n_bins=n_bins, min_depth=self.min_depth, max_depth=self.max_depth)
451
+ self.seed_projector = Projector(btlnck_features, bin_embedding_dim)
452
+ self.projectors = nn.ModuleList([
453
+ Projector(num_out, bin_embedding_dim)
454
+ for num_out in num_out_features
455
+ ])
456
+ self.attractors = nn.ModuleList([
457
+ Attractor(bin_embedding_dim, n_bins, n_attractors=n_attractors[i], min_depth=self.min_depth, max_depth=self.max_depth,
458
+ alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type)
459
+ for i in range(len(num_out_features))
460
+ ])
461
+
462
+ last_in = 192 + 1
463
+ self.conditional_log_binomial = ConditionalLogBinomial(
464
+ last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp)
465
+ elif self.mViT and not self.custom:
466
+ n_bins = 256
467
+ self.adaptive_bins_layer = mViT(192, n_query_channels=192, patch_size=16,
468
+ dim_out=n_bins,
469
+ embedding_dim=192, norm='linear')
470
+ self.conv_out = nn.Sequential(nn.Conv2d(192, n_bins, kernel_size=1, stride=1, padding=0),
471
+ nn.Softmax(dim=1))
472
+
473
+
474
+ def forward(self, x, class_ids=None, img_paths=None):
475
+ b, c, h, w = x.shape
476
+ x = x*2.0 - 1.0 # normalize to [-1, 1]
477
+ if h == 480 and w == 480:
478
+ new_x = torch.zeros(b, c, 512, 512, device=x.device)
479
+ new_x[:, :, 0:480, 0:480] = x
480
+ x = new_x
481
+ elif h==352 and w==352:
482
+ new_x = torch.zeros(b, c, 384, 384, device=x.device)
483
+ new_x[:, :, 0:352, 0:352] = x
484
+ x = new_x
485
+ elif h == 512 and w == 512:
486
+ pass
487
+ else:
488
+ print(h,w)
489
+ raise NotImplementedError
490
+ conv_feats = self.encoder(x, class_ids, img_paths)
491
+
492
+ if h == 480 or h == 352:
493
+ conv_feats = conv_feats[:, :, :-1, :-1]
494
+
495
+ self.decoder.remove_hooks()
496
+ out_depth, out, x_blocks = self.decoder([conv_feats])
497
+
498
+ if not self.mViT and not self.custom:
499
+ x = self.bottle_neck(conv_feats)
500
+ _, seed_b_centers = self.seed_bin_regressor(x)
501
+
502
+ if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2':
503
+ b_prev = (seed_b_centers - self.min_depth) / \
504
+ (self.max_depth - self.min_depth)
505
+ else:
506
+ b_prev = seed_b_centers
507
+
508
+ prev_b_embedding = self.seed_projector(x)
509
+
510
+ for projector, attractor, x in zip(self.projectors, self.attractors, x_blocks):
511
+ b_embedding = projector(x)
512
+ b, b_centers = attractor(
513
+ b_embedding, b_prev, prev_b_embedding, interpolate=True)
514
+ b_prev = b.clone()
515
+ prev_b_embedding = b_embedding.clone()
516
+
517
+ rel_cond = torch.sigmoid(out_depth) * self.max_depth
518
+
519
+ # concat rel depth with last. First interpolate rel depth to last size
520
+ rel_cond = nn.functional.interpolate(
521
+ rel_cond, size=out.shape[2:], mode='bilinear', align_corners=True)
522
+ last = torch.cat([out, rel_cond], dim=1)
523
+
524
+ b_embedding = nn.functional.interpolate(
525
+ b_embedding, last.shape[-2:], mode='bilinear', align_corners=True)
526
+ x = self.conditional_log_binomial(last, b_embedding)
527
+
528
+ # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
529
+ b_centers = nn.functional.interpolate(
530
+ b_centers, x.shape[-2:], mode='bilinear', align_corners=True)
531
+ out_depth = torch.sum(x * b_centers, dim=1, keepdim=True)
532
+
533
+ elif self.mViT and not self.custom:
534
+ bin_widths_normed, range_attention_maps = self.adaptive_bins_layer(out)
535
+ out = self.conv_out(range_attention_maps)
536
+
537
+ bin_widths = (self.max_depth - self.min_depth) * bin_widths_normed # .shape = N, dim_out
538
+ bin_widths = nn.functional.pad(bin_widths, (1, 0), mode='constant', value=self.min_depth)
539
+ bin_edges = torch.cumsum(bin_widths, dim=1)
540
+
541
+ centers = 0.5 * (bin_edges[:, :-1] + bin_edges[:, 1:])
542
+ n, dout = centers.size()
543
+ centers = centers.view(n, dout, 1, 1)
544
+
545
+ out_depth = torch.sum(out * centers, dim=1, keepdim=True)
546
+ else:
547
+ out_depth = torch.sigmoid(out_depth) * self.max_depth
548
+
549
+ return {'pred_d': out_depth}
550
+
551
+
552
+ class Decoder(nn.Module):
553
+ def __init__(self, in_channels, out_channels, args):
554
+ super().__init__()
555
+ self.deconv = args.num_deconv
556
+ self.in_channels = in_channels
557
+
558
+ embed_dim = 192
559
+
560
+ channels_in = embed_dim*8
561
+ channels_out = embed_dim
562
+
563
+ self.deconv_layers, self.intermediate_results = self._make_deconv_layer(
564
+ args.num_deconv,
565
+ args.num_filters,
566
+ args.deconv_kernels,
567
+ )
568
+ self.last_layer_depth = nn.Sequential(
569
+ nn.Conv2d(channels_out, channels_out, kernel_size=3, stride=1, padding=1),
570
+ nn.ReLU(inplace=False),
571
+ nn.Conv2d(channels_out, 1, kernel_size=3, stride=1, padding=1))
572
+
573
+ for m in self.last_layer_depth.modules():
574
+ if isinstance(m, nn.Conv2d):
575
+ normal_init(m, std=0.001, bias=0)
576
+
577
+ conv_layers = []
578
+ conv_layers.append(
579
+ build_conv_layer(
580
+ dict(type='Conv2d'),
581
+ in_channels=args.num_filters[-1],
582
+ out_channels=out_channels,
583
+ kernel_size=3,
584
+ stride=1,
585
+ padding=1))
586
+ conv_layers.append(
587
+ build_norm_layer(dict(type='BN'), out_channels)[1])
588
+ conv_layers.append(nn.ReLU())
589
+ self.conv_layers = nn.Sequential(*conv_layers)
590
+
591
+ self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
592
+
593
+ def forward(self, conv_feats):
594
+ out = self.deconv_layers(conv_feats[0])
595
+ out = self.conv_layers(out)
596
+ out = self.up(out)
597
+ self.intermediate_results.append(out)
598
+ out = self.up(out)
599
+ out_depth = self.last_layer_depth(out)
600
+
601
+ return out_depth, out, self.intermediate_results
602
+
603
+ def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
604
+ """Make deconv layers."""
605
+
606
+ layers = []
607
+ in_planes = self.in_channels
608
+ intermediate_results = [] # List to store intermediate feature maps
609
+
610
+ for i in range(num_layers):
611
+ kernel, padding, output_padding = \
612
+ self._get_deconv_cfg(num_kernels[i])
613
+
614
+ planes = num_filters[i]
615
+ layers.append(
616
+ build_upsample_layer(
617
+ dict(type='deconv'),
618
+ in_channels=in_planes,
619
+ out_channels=planes,
620
+ kernel_size=kernel,
621
+ stride=2,
622
+ padding=padding,
623
+ output_padding=output_padding,
624
+ bias=False))
625
+ layers.append(nn.BatchNorm2d(planes))
626
+ layers.append(nn.ReLU())
627
+ in_planes = planes
628
+
629
+ # Add a hook to store the intermediate result
630
+ layers[-1].register_forward_hook(self._hook_fn(intermediate_results))
631
+
632
+ return nn.Sequential(*layers), intermediate_results
633
+
634
+ def _hook_fn(self, intermediate_results):
635
+ def hook(module, input, output):
636
+ intermediate_results.append(output)
637
+ return hook
638
+
639
+ def remove_hooks(self):
640
+ self.intermediate_results.clear()
641
+
642
+ def _get_deconv_cfg(self, deconv_kernel):
643
+ """Get configurations for deconv layers."""
644
+ if deconv_kernel == 4:
645
+ padding = 1
646
+ output_padding = 0
647
+ elif deconv_kernel == 3:
648
+ padding = 1
649
+ output_padding = 1
650
+ elif deconv_kernel == 2:
651
+ padding = 0
652
+ output_padding = 0
653
+ else:
654
+ raise ValueError(f'Not supported num_kernels ({deconv_kernel}).')
655
+
656
+ return deconv_kernel, padding, output_padding
657
+
658
+ def init_weights(self):
659
+ """Initialize model weights."""
660
+ for m in self.modules():
661
+ if isinstance(m, nn.Conv2d):
662
+ normal_init(m, std=0.001, bias=0)
663
+ elif isinstance(m, nn.BatchNorm2d):
664
+ constant_init(m, 1)
665
+ elif isinstance(m, nn.ConvTranspose2d):
666
+ normal_init(m, std=0.001)
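As a standalone illustration of the depth readout used in the mViT branch of EVPDepth.forward above (an expectation over metric bin centers), with made-up values:

    import torch

    min_depth, max_depth, n_bins = 1e-3, 10.0, 4
    bin_widths_normed = torch.tensor([[0.1, 0.2, 0.3, 0.4]])   # (N, n_bins), each row sums to 1
    probs = torch.rand(1, n_bins, 2, 2).softmax(dim=1)         # per-pixel distribution over bins

    bin_widths = (max_depth - min_depth) * bin_widths_normed
    bin_widths = torch.nn.functional.pad(bin_widths, (1, 0), value=min_depth)
    bin_edges = torch.cumsum(bin_widths, dim=1)
    centers = 0.5 * (bin_edges[:, :-1] + bin_edges[:, 1:])     # (N, n_bins) metric bin centers
    depth = torch.sum(probs * centers.view(1, n_bins, 1, 1), dim=1, keepdim=True)
    print(depth.shape)                                         # (1, 1, 2, 2)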
depth/models_depth/model_vpd.py ADDED
@@ -0,0 +1,252 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft
3
+ # Licensed under the MIT License.
4
+ # The deconvolution code is based on Simple Baseline.
5
+ # (https://github.com/microsoft/human-pose-estimation.pytorch/blob/master/lib/models/pose_resnet.py)
6
+ # Modified by Zigang Geng (zigang@mail.ustc.edu.cn).
7
+ # ------------------------------------------------------------------------------
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from timm.models.layers import trunc_normal_, DropPath
12
+ from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
13
+ constant_init, normal_init)
14
+ from omegaconf import OmegaConf
15
+ from ldm.util import instantiate_from_config
16
+ import torch.nn.functional as F
17
+
18
+ from evp.models import UNetWrapper, TextAdapterDepth
19
+
20
+ class VPDDepthEncoder(nn.Module):
21
+ def __init__(self, out_dim=1024, ldm_prior=[320, 640, 1280+1280], sd_path=None, text_dim=768,
22
+ dataset='nyu'
23
+ ):
24
+ super().__init__()
25
+
26
+
27
+ self.layer1 = nn.Sequential(
28
+ nn.Conv2d(ldm_prior[0], ldm_prior[0], 3, stride=2, padding=1),
29
+ nn.GroupNorm(16, ldm_prior[0]),
30
+ nn.ReLU(),
31
+ nn.Conv2d(ldm_prior[0], ldm_prior[0], 3, stride=2, padding=1),
32
+ )
33
+
34
+ self.layer2 = nn.Sequential(
35
+ nn.Conv2d(ldm_prior[1], ldm_prior[1], 3, stride=2, padding=1),
36
+ )
37
+
38
+ self.out_layer = nn.Sequential(
39
+ nn.Conv2d(sum(ldm_prior), out_dim, 1),
40
+ nn.GroupNorm(16, out_dim),
41
+ nn.ReLU(),
42
+ )
43
+
44
+ self.apply(self._init_weights)
45
+
46
+ ### stable diffusion layers
47
+
48
+ config = OmegaConf.load('./v1-inference.yaml')
49
+ if sd_path is None:
50
+ config.model.params.ckpt_path = '../checkpoints/v1-5-pruned-emaonly.ckpt'
51
+ else:
52
+ config.model.params.ckpt_path = f'../{sd_path}'
53
+
54
+ sd_model = instantiate_from_config(config.model)
55
+ self.encoder_vq = sd_model.first_stage_model
56
+
57
+ self.unet = UNetWrapper(sd_model.model, use_attn=False)
58
+
59
+ del sd_model.cond_stage_model
60
+ del self.encoder_vq.decoder
61
+ del self.unet.unet.diffusion_model.out
62
+
63
+ for param in self.encoder_vq.parameters():
64
+ param.requires_grad = False
65
+
66
+ if dataset == 'nyu':
67
+ self.text_adapter = TextAdapterDepth(text_dim=text_dim)
68
+ class_embeddings = torch.load('nyu_class_embeddings.pth')
69
+ else:
70
+ raise NotImplementedError
71
+
72
+ self.register_buffer('class_embeddings', class_embeddings)
73
+ self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4)
74
+
75
+
76
+ def _init_weights(self, m):
77
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
78
+ trunc_normal_(m.weight, std=.02)
79
+ nn.init.constant_(m.bias, 0)
80
+
81
+ def forward_features(self, feats):
82
+ x = self.ldm_to_net[0](feats[0])
83
+ for i in range(3):
84
+ if i > 0:
85
+ x = x + self.ldm_to_net[i](feats[i])
86
+ x = self.layers[i](x)
87
+ x = self.upsample_layers[i](x)
88
+ return self.out_conv(x)
89
+
90
+ def forward(self, x, class_ids=None,img_paths=None):
91
+ with torch.no_grad():
92
+ latents = self.encoder_vq.encode(x).mode().detach()
93
+
94
+ if class_ids is not None:
95
+ class_embeddings = self.class_embeddings[class_ids.tolist()]
96
+ else:
97
+ class_embeddings = self.class_embeddings
98
+
99
+ c_crossattn = self.text_adapter(latents, class_embeddings, self.gamma) # NOTE: here the c_crossattn should be expand_dim as latents
100
+ t = torch.ones((x.shape[0],), device=x.device).long()
101
+ # import pdb; pdb.set_trace()
102
+ outs = self.unet(latents, t, c_crossattn=[c_crossattn])
103
+ feats = [outs[0], outs[1], torch.cat([outs[2], F.interpolate(outs[3], scale_factor=2)], dim=1)]
104
+ x = torch.cat([self.layer1(feats[0]), self.layer2(feats[1]), feats[2]], dim=1)
105
+ return self.out_layer(x)
106
+
107
+ class VPDDepth(nn.Module):
108
+ def __init__(self, args=None):
109
+ super().__init__()
110
+ self.max_depth = args.max_depth
111
+
112
+ embed_dim = 192
113
+
114
+ channels_in = embed_dim*8
115
+ channels_out = embed_dim
116
+
117
+ if args.dataset == 'nyudepthv2':
118
+ self.encoder = VPDDepthEncoder(out_dim=channels_in, dataset='nyu')
119
+ else:
120
+ raise NotImplementedError
121
+
122
+ self.decoder = Decoder(channels_in, channels_out, args)
123
+ self.decoder.init_weights()
124
+
125
+ self.last_layer_depth = nn.Sequential(
126
+ nn.Conv2d(channels_out, channels_out, kernel_size=3, stride=1, padding=1),
127
+ nn.ReLU(inplace=False),
128
+ nn.Conv2d(channels_out, 1, kernel_size=3, stride=1, padding=1))
129
+
130
+ for m in self.last_layer_depth.modules():
131
+ if isinstance(m, nn.Conv2d):
132
+ normal_init(m, std=0.001, bias=0)
133
+
134
+ def forward(self, x, class_ids=None,img_paths=None):
135
+ # import pdb; pdb.set_trace()
136
+ b, c, h, w = x.shape
137
+ x = x*2.0 - 1.0 # normalize to [-1, 1]
138
+ if h == 480 and w == 480:
139
+ new_x = torch.zeros(b, c, 512, 512, device=x.device)
140
+ new_x[:, :, 0:480, 0:480] = x
141
+ x = new_x
142
+ elif h==352 and w==352:
143
+ new_x = torch.zeros(b, c, 384, 384, device=x.device)
144
+ new_x[:, :, 0:352, 0:352] = x
145
+ x = new_x
146
+ elif h == 512 and w == 512:
147
+ pass
148
+ else:
149
+ raise NotImplementedError
150
+ conv_feats = self.encoder(x, class_ids)
151
+
152
+ if h == 480 or h == 352:
153
+ conv_feats = conv_feats[:, :, :-1, :-1]
154
+
155
+ out = self.decoder([conv_feats])
156
+ out_depth = self.last_layer_depth(out)
157
+ out_depth = torch.sigmoid(out_depth) * self.max_depth
158
+
159
+ return {'pred_d': out_depth}
160
+
161
+
162
+ class Decoder(nn.Module):
163
+ def __init__(self, in_channels, out_channels, args):
164
+ super().__init__()
165
+ self.deconv = args.num_deconv
166
+ self.in_channels = in_channels
167
+
168
+ # import pdb; pdb.set_trace()
169
+
170
+ self.deconv_layers = self._make_deconv_layer(
171
+ args.num_deconv,
172
+ args.num_filters,
173
+ args.deconv_kernels,
174
+ )
175
+
176
+ conv_layers = []
177
+ conv_layers.append(
178
+ build_conv_layer(
179
+ dict(type='Conv2d'),
180
+ in_channels=args.num_filters[-1],
181
+ out_channels=out_channels,
182
+ kernel_size=3,
183
+ stride=1,
184
+ padding=1))
185
+ conv_layers.append(
186
+ build_norm_layer(dict(type='BN'), out_channels)[1])
187
+ conv_layers.append(nn.ReLU(inplace=True))
188
+ self.conv_layers = nn.Sequential(*conv_layers)
189
+
190
+ self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
191
+
192
+ def forward(self, conv_feats):
193
+ # import pdb; pdb.set_trace()
194
+ out = self.deconv_layers(conv_feats[0])
195
+ out = self.conv_layers(out)
196
+
197
+ out = self.up(out)
198
+ out = self.up(out)
199
+
200
+ return out
201
+
202
+ def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
203
+ """Make deconv layers."""
204
+
205
+ layers = []
206
+ in_planes = self.in_channels
207
+ for i in range(num_layers):
208
+ kernel, padding, output_padding = \
209
+ self._get_deconv_cfg(num_kernels[i])
210
+
211
+ planes = num_filters[i]
212
+ layers.append(
213
+ build_upsample_layer(
214
+ dict(type='deconv'),
215
+ in_channels=in_planes,
216
+ out_channels=planes,
217
+ kernel_size=kernel,
218
+ stride=2,
219
+ padding=padding,
220
+ output_padding=output_padding,
221
+ bias=False))
222
+ layers.append(nn.BatchNorm2d(planes))
223
+ layers.append(nn.ReLU(inplace=True))
224
+ in_planes = planes
225
+
226
+ return nn.Sequential(*layers)
227
+
228
+ def _get_deconv_cfg(self, deconv_kernel):
229
+ """Get configurations for deconv layers."""
230
+ if deconv_kernel == 4:
231
+ padding = 1
232
+ output_padding = 0
233
+ elif deconv_kernel == 3:
234
+ padding = 1
235
+ output_padding = 1
236
+ elif deconv_kernel == 2:
237
+ padding = 0
238
+ output_padding = 0
239
+ else:
240
+ raise ValueError(f'Not supported num_kernels ({deconv_kernel}).')
241
+
242
+ return deconv_kernel, padding, output_padding
243
+
244
+ def init_weights(self):
245
+ """Initialize model weights."""
246
+ for m in self.modules():
247
+ if isinstance(m, nn.Conv2d):
248
+ normal_init(m, std=0.001, bias=0)
249
+ elif isinstance(m, nn.BatchNorm2d):
250
+ constant_init(m, 1)
251
+ elif isinstance(m, nn.ConvTranspose2d):
252
+ normal_init(m, std=0.001)
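Both depth models above zero-pad 480x480 (and 352x352) inputs before the Stable Diffusion encoder and then crop the resulting feature map. A minimal sketch of that size handling; the feature resolution and channel count are placeholders, not measured from the encoder:

    import torch

    x = torch.rand(1, 3, 480, 480)
    padded = torch.zeros(1, 3, 512, 512)
    padded[:, :, :480, :480] = x              # zero-pad to the size the encoder expects

    conv_feats = torch.rand(1, 1536, 16, 16)  # placeholder for the encoder output on the padded input
    conv_feats = conv_feats[:, :, :-1, :-1]   # drop the row/column contributed by the padding
    print(conv_feats.shape)                   # (1, 1536, 15, 15)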
depth/models_depth/optimizer.py ADDED
@@ -0,0 +1,154 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft
3
+ # Licensed under the MIT License.
4
+ # The code is from SimMIM.
5
+ # (https://github.com/microsoft/SimMIM)
6
+ # ------------------------------------------------------------------------------
7
+
8
+ import json
9
+ from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
10
+ from mmcv.runner import build_optimizer
11
+ from mmcv.runner import get_dist_info
12
+
13
+
14
+ def get_num_layer_for_swin(var_name, num_max_layer, layers_per_stage):
15
+ var_name = var_name.replace('encoder', 'backbone') if var_name.startswith('encoder') else var_name
16
+ if var_name in ("backbone.cls_token", "backbone.mask_token",
17
+ "backbone.pos_embed", "backbone.absolute_pos_embed"):
18
+ return 0
19
+ elif var_name.startswith("backbone.patch_embed"):
20
+ return 0
21
+ elif var_name.startswith("backbone.layers"):
22
+ if var_name.split('.')[3] == "blocks":
23
+ stage_id = int(var_name.split('.')[2])
24
+ layer_id = int(var_name.split('.')[4]) \
25
+ + sum(layers_per_stage[:stage_id])
26
+ return layer_id + 1
27
+ elif var_name.split('.')[3] == "downsample":
28
+ stage_id = int(var_name.split('.')[2])
29
+ layer_id = sum(layers_per_stage[:stage_id + 1])
30
+ return layer_id
31
+ else:
32
+ return num_max_layer - 1
33
+
34
+ @OPTIMIZER_BUILDERS.register_module()
35
+ class LDMOptimizerConstructor(DefaultOptimizerConstructor):
36
+ def add_params(self, params, module, prefix='', is_dcn_module=None):
37
+ """Add all parameters of module to the params list.
38
+ The parameters of the given module will be added to the list of param
39
+ groups, with specific rules defined by paramwise_cfg.
40
+ Args:
41
+ params (list[dict]): A list of param groups, it will be modified
42
+ in place.
43
+ module (nn.Module): The module to be added.
44
+ prefix (str): The prefix of the module
45
+ is_dcn_module (int|float|None): If the current module is a
46
+ submodule of DCN, `is_dcn_module` will be passed to
47
+ control conv_offset layer's learning rate. Defaults to None.
48
+ """
49
+ parameter_groups = {}
50
+ no_decay_names = self.paramwise_cfg.get('no_decay_names', [])
51
+ print("Build LDMOptimizerConstructor")
52
+ weight_decay = self.base_wd
53
+
54
+ for name, param in module.named_parameters():
55
+ if not param.requires_grad:
56
+ continue # frozen weights
57
+ if len(param.shape) == 1 or name.endswith(".bias") or name in ('absolute_pos_embed',):
58
+ group_name = "no_decay"
59
+ this_weight_decay = 0.
60
+ else:
61
+ group_name = "decay"
62
+ this_weight_decay = weight_decay
63
+
64
+ for nd_name in no_decay_names:
65
+ if nd_name in name:
66
+ group_name = "no_decay"
67
+ this_weight_decay = 0.
68
+ break
69
+
70
+ if 'unet' in name or 'cond_stage_model' in name or 'encoder_vq' in name or 'clip_model' in name:
71
+ layer_id = 0
72
+ else:
73
+ layer_id = 1
74
+ group_name = "layer_%d_%s" % (layer_id, group_name)
75
+
76
+ if group_name not in parameter_groups:
77
+ if layer_id == 0:
78
+ scale = 0.01
79
+ else:
80
+ scale = 1.0
81
+
82
+ parameter_groups[group_name] = {
83
+ "weight_decay": this_weight_decay,
84
+ "params": [],
85
+ "param_names": [],
86
+ "lr_scale": scale,
87
+ "group_name": group_name,
88
+ "lr": scale * self.base_lr,
89
+ }
90
+
91
+ parameter_groups[group_name]["params"].append(param)
92
+ parameter_groups[group_name]["param_names"].append(name)
93
+ rank, _ = get_dist_info()
94
+ if rank == 0:
95
+ to_display = {}
96
+ for key in parameter_groups:
97
+ to_display[key] = {
98
+ "param_names": parameter_groups[key]["param_names"],
99
+ "lr_scale": parameter_groups[key]["lr_scale"],
100
+ "lr": parameter_groups[key]["lr"],
101
+ "weight_decay": parameter_groups[key]["weight_decay"],
102
+ }
103
+
104
+ params.extend(parameter_groups.values())
105
+
106
+ def build_optimizers(model, cfgs):
107
+ """Build multiple optimizers from configs.
108
+
109
+ If `cfgs` contains several dicts for optimizers, then a dict for each
110
+ constructed optimizers will be returned.
111
+ If `cfgs` only contains one optimizer config, the constructed optimizer
112
+ itself will be returned.
113
+
114
+ For example,
115
+
116
+ 1) Multiple optimizer configs:
117
+
118
+ .. code-block:: python
119
+
120
+ optimizer_cfg = dict(
121
+ model1=dict(type='SGD', lr=lr),
122
+ model2=dict(type='SGD', lr=lr))
123
+
124
+ The return dict is
125
+ ``dict('model1': torch.optim.Optimizer, 'model2': torch.optim.Optimizer)``
126
+
127
+ 2) Single optimizer config:
128
+
129
+ .. code-block:: python
130
+
131
+ optimizer_cfg = dict(type='SGD', lr=lr)
132
+
133
+ The return is ``torch.optim.Optimizer``.
134
+
135
+ Args:
136
+ model (:obj:`nn.Module`): The model with parameters to be optimized.
137
+ cfgs (dict): The config dict of the optimizer.
138
+
139
+ Returns:
140
+ dict[:obj:`torch.optim.Optimizer`] | :obj:`torch.optim.Optimizer`:
141
+ The initialized optimizers.
142
+ """
143
+ optimizers = {}
144
+ if hasattr(model, 'module'):
145
+ model = model.module
146
+ # determine whether 'cfgs' has several dicts for optimizers
147
+ if all(isinstance(v, dict) for v in cfgs.values()):
148
+ for key, cfg in cfgs.items():
149
+ cfg_ = cfg.copy()
150
+ module = getattr(model, key)
151
+ optimizers[key] = build_optimizer(module, cfg_)
152
+ return optimizers
153
+
154
+ return build_optimizer(model, cfgs)
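A hedged sketch of calling build_optimizers with the LDMOptimizerConstructor defined above; the optimizer type, learning rate and paramwise settings are placeholders, not the repo's training configuration:

    import torch.nn as nn
    from models_depth.optimizer import build_optimizers  # hypothetical import path

    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.Conv2d(8, 8, 3))
    optimizer_cfg = dict(
        type='AdamW', lr=5e-4, weight_decay=0.1,
        constructor='LDMOptimizerConstructor',
        paramwise_cfg=dict(no_decay_names=['norm', 'bias']))
    optimizer = build_optimizers(model, optimizer_cfg)   # single config -> a single torch.optim.Optimizer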
depth/requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ torch>=1.6.0
2
+ h5py>=3.6.0
3
+ scipy>=1.7.3
4
+ opencv-python>=4.5.5
5
+ timm>=0.5.4
6
+ albumentations>=1.1.0
7
+ tensorboardX>=2.4.1
8
+ gdown>=4.2.1
depth/test_img.jpg ADDED
depth/utils.py ADDED
@@ -0,0 +1,525 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ import os
10
+ import math
11
+ import time
12
+ from collections import defaultdict, deque
13
+ import datetime
14
+ import numpy as np
15
+ from timm.utils import get_state_dict
16
+
17
+ from pathlib import Path
18
+
19
+ import torch
20
+ import torch.distributed as dist
21
+ from math import inf  # torch._six was removed in recent PyTorch releases
22
+
23
+ from tensorboardX import SummaryWriter
24
+
25
+ class SmoothedValue(object):
26
+ """Track a series of values and provide access to smoothed values over a
27
+ window or the global series average.
28
+ """
29
+
30
+ def __init__(self, window_size=20, fmt=None):
31
+ if fmt is None:
32
+ fmt = "{median:.4f} ({global_avg:.4f})"
33
+ self.deque = deque(maxlen=window_size)
34
+ self.total = 0.0
35
+ self.count = 0
36
+ self.fmt = fmt
37
+
38
+ def update(self, value, n=1):
39
+ self.deque.append(value)
40
+ self.count += n
41
+ self.total += value * n
42
+
43
+ def synchronize_between_processes(self):
44
+ """
45
+ Warning: does not synchronize the deque!
46
+ """
47
+ if not is_dist_avail_and_initialized():
48
+ return
49
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
50
+ dist.barrier()
51
+ dist.all_reduce(t)
52
+ t = t.tolist()
53
+ self.count = int(t[0])
54
+ self.total = t[1]
55
+
56
+ @property
57
+ def median(self):
58
+ d = torch.tensor(list(self.deque))
59
+ return d.median().item()
60
+
61
+ @property
62
+ def avg(self):
63
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
64
+ return d.mean().item()
65
+
66
+ @property
67
+ def global_avg(self):
68
+ return self.total / self.count
69
+
70
+ @property
71
+ def max(self):
72
+ return max(self.deque)
73
+
74
+ @property
75
+ def value(self):
76
+ return self.deque[-1]
77
+
78
+ def __str__(self):
79
+ return self.fmt.format(
80
+ median=self.median,
81
+ avg=self.avg,
82
+ global_avg=self.global_avg,
83
+ max=self.max,
84
+ value=self.value)
85
+
86
+
87
+ class MetricLogger(object):
88
+ def __init__(self, delimiter="\t"):
89
+ self.meters = defaultdict(SmoothedValue)
90
+ self.delimiter = delimiter
91
+
92
+ def update(self, **kwargs):
93
+ for k, v in kwargs.items():
94
+ if v is None:
95
+ continue
96
+ if isinstance(v, torch.Tensor):
97
+ v = v.item()
98
+ assert isinstance(v, (float, int))
99
+ self.meters[k].update(v)
100
+
101
+ def __getattr__(self, attr):
102
+ if attr in self.meters:
103
+ return self.meters[attr]
104
+ if attr in self.__dict__:
105
+ return self.__dict__[attr]
106
+ raise AttributeError("'{}' object has no attribute '{}'".format(
107
+ type(self).__name__, attr))
108
+
109
+ def __str__(self):
110
+ loss_str = []
111
+ for name, meter in self.meters.items():
112
+ loss_str.append(
113
+ "{}: {}".format(name, str(meter))
114
+ )
115
+ return self.delimiter.join(loss_str)
116
+
117
+ def synchronize_between_processes(self):
118
+ for meter in self.meters.values():
119
+ meter.synchronize_between_processes()
120
+
121
+ def add_meter(self, name, meter):
122
+ self.meters[name] = meter
123
+
124
+ def log_every(self, iterable, print_freq, header=None):
125
+ i = 0
126
+ if not header:
127
+ header = ''
128
+ start_time = time.time()
129
+ end = time.time()
130
+ iter_time = SmoothedValue(fmt='{avg:.4f}')
131
+ data_time = SmoothedValue(fmt='{avg:.4f}')
132
+ space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
133
+ log_msg = [
134
+ header,
135
+ '[{0' + space_fmt + '}/{1}]',
136
+ 'eta: {eta}',
137
+ '{meters}',
138
+ 'time: {time}',
139
+ 'data: {data}'
140
+ ]
141
+ if torch.cuda.is_available():
142
+ log_msg.append('max mem: {memory:.0f}')
143
+ log_msg = self.delimiter.join(log_msg)
144
+ MB = 1024.0 * 1024.0
145
+ for obj in iterable:
146
+ data_time.update(time.time() - end)
147
+ yield obj
148
+ iter_time.update(time.time() - end)
149
+ if i % print_freq == 0 or i == len(iterable) - 1:
150
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
151
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
152
+ if torch.cuda.is_available():
153
+ print(log_msg.format(
154
+ i, len(iterable), eta=eta_string,
155
+ meters=str(self),
156
+ time=str(iter_time), data=str(data_time),
157
+ memory=torch.cuda.max_memory_allocated() / MB))
158
+ else:
159
+ print(log_msg.format(
160
+ i, len(iterable), eta=eta_string,
161
+ meters=str(self),
162
+ time=str(iter_time), data=str(data_time)))
163
+ i += 1
164
+ end = time.time()
165
+ total_time = time.time() - start_time
166
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
167
+ print('{} Total time: {} ({:.4f} s / it)'.format(
168
+ header, total_time_str, total_time / len(iterable)))
169
+
170
+
171
+ class TensorboardLogger(object):
172
+ def __init__(self, log_dir):
173
+ self.writer = SummaryWriter(logdir=log_dir)
174
+ self.step = 0
175
+
176
+ def set_step(self, step=None):
177
+ if step is not None:
178
+ self.step = step
179
+ else:
180
+ self.step += 1
181
+
182
+ def update(self, head='scalar', step=None, **kwargs):
183
+ for k, v in kwargs.items():
184
+ if v is None:
185
+ continue
186
+ if isinstance(v, torch.Tensor):
187
+ v = v.item()
188
+ assert isinstance(v, (float, int))
189
+ self.writer.add_scalar(head + "/" + k, v, self.step if step is None else step)
190
+
191
+ def flush(self):
192
+ self.writer.flush()
193
+
194
+
195
+ class WandbLogger(object):
196
+ def __init__(self, args):
197
+ self.args = args
198
+
199
+ try:
200
+ import wandb
201
+ self._wandb = wandb
202
+ except ImportError:
203
+ raise ImportError(
204
+ "To use the Weights and Biases Logger please install wandb. "
205
+ "Run `pip install wandb` to install it."
206
+ )
207
+
208
+ # Initialize a W&B run
209
+ if self._wandb.run is None:
210
+ self._wandb.init(
211
+ project=args.project,
212
+ config=args
213
+ )
214
+
215
+ def log_epoch_metrics(self, metrics, commit=True):
216
+ """
217
+ Log train/test metrics onto W&B.
218
+ """
219
+ # Log number of model parameters as W&B summary
220
+ self._wandb.summary['n_parameters'] = metrics.get('n_parameters', None)
221
+ metrics.pop('n_parameters', None)
222
+
223
+ # Log current epoch
224
+ self._wandb.log({'epoch': metrics.get('epoch')}, commit=False)
225
+ metrics.pop('epoch')
226
+
227
+ for k, v in metrics.items():
228
+ if 'train' in k:
229
+ self._wandb.log({f'Global Train/{k}': v}, commit=False)
230
+ elif 'test' in k:
231
+ self._wandb.log({f'Global Test/{k}': v}, commit=False)
232
+
233
+ self._wandb.log({})
234
+
235
+ def log_checkpoints(self):
236
+ output_dir = self.args.output_dir
237
+ model_artifact = self._wandb.Artifact(
238
+ self._wandb.run.id + "_model", type="model"
239
+ )
240
+
241
+ model_artifact.add_dir(output_dir)
242
+ self._wandb.log_artifact(model_artifact, aliases=["latest", "best"])
243
+
244
+ def set_steps(self):
245
+ # Set global training step
246
+ self._wandb.define_metric('Rank-0 Batch Wise/*', step_metric='Rank-0 Batch Wise/global_train_step')
247
+ # Set epoch-wise step
248
+ self._wandb.define_metric('Global Train/*', step_metric='epoch')
249
+ self._wandb.define_metric('Global Test/*', step_metric='epoch')
250
+
251
+
252
+ def setup_for_distributed(is_master):
253
+ """
254
+ This function disables printing when not in the master process
255
+ """
256
+ import builtins as __builtin__
257
+ builtin_print = __builtin__.print
258
+
259
+ def print(*args, **kwargs):
260
+ force = kwargs.pop('force', False)
261
+ if is_master or force:
262
+ builtin_print(*args, **kwargs)
263
+
264
+ __builtin__.print = print
265
+
266
+
267
+ def is_dist_avail_and_initialized():
268
+ if not dist.is_available():
269
+ return False
270
+ if not dist.is_initialized():
271
+ return False
272
+ return True
273
+
274
+
275
+ def get_world_size():
276
+ if not is_dist_avail_and_initialized():
277
+ return 1
278
+ return dist.get_world_size()
279
+
280
+
281
+ def get_rank():
282
+ if not is_dist_avail_and_initialized():
283
+ return 0
284
+ return dist.get_rank()
285
+
286
+
287
+ def is_main_process():
288
+ return get_rank() == 0
289
+
290
+
291
+ def save_on_master(*args, **kwargs):
292
+ if is_main_process():
293
+ torch.save(*args, **kwargs)
294
+
295
+
296
+ def init_distributed_mode(args):
297
+
298
+ if args.dist_on_itp:
299
+ args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
300
+ args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
301
+ args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
302
+ args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
303
+ os.environ['LOCAL_RANK'] = str(args.gpu)
304
+ os.environ['RANK'] = str(args.rank)
305
+ os.environ['WORLD_SIZE'] = str(args.world_size)
306
+ # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
307
+ elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
308
+ args.rank = int(os.environ["RANK"])
309
+ args.world_size = int(os.environ['WORLD_SIZE'])
310
+ args.gpu = int(os.environ['LOCAL_RANK'])
311
+ elif 'SLURM_PROCID' in os.environ:
312
+ args.rank = int(os.environ['SLURM_PROCID'])
313
+ args.gpu = args.rank % torch.cuda.device_count()
314
+
315
+ os.environ['RANK'] = str(args.rank)
316
+ os.environ['LOCAL_RANK'] = str(args.gpu)
317
+ os.environ['WORLD_SIZE'] = str(args.world_size)
318
+ else:
319
+ print('Not using distributed mode')
320
+ args.distributed = False
321
+ return
322
+
323
+ args.distributed = True
324
+
325
+ torch.cuda.set_device(args.gpu)
326
+ args.dist_backend = 'nccl'
327
+ print('| distributed init (rank {}): {}, gpu {}'.format(
328
+ args.rank, args.dist_url, args.gpu), flush=True)
329
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
330
+ world_size=args.world_size, rank=args.rank)
331
+ torch.distributed.barrier()
332
+ setup_for_distributed(args.rank == 0)
333
+
334
+
335
+ def init_distributed_mode_simple(args):
336
+
337
+ args.rank = int(os.environ["RANK"])
338
+ args.world_size = int(os.environ['WORLD_SIZE'])
339
+ args.gpu = int(os.environ['LOCAL_RANK'])
340
+ args.dist_url = 'env://'
341
+
342
+ args.distributed = True
343
+
344
+ torch.cuda.set_device(args.gpu)
345
+ args.dist_backend = 'nccl'
346
+ print('| distributed init (rank {}): {}, gpu {}'.format(
347
+ args.rank, args.dist_url, args.gpu), flush=True)
348
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
349
+ world_size=args.world_size, rank=args.rank)
350
+ torch.distributed.barrier()
351
+ setup_for_distributed(args.rank == 0)
352
+
353
+ def load_state_dict(model, state_dict, prefix='', ignore_missing="relative_position_index"):
354
+ missing_keys = []
355
+ unexpected_keys = []
356
+ error_msgs = []
357
+ # copy state_dict so _load_from_state_dict can modify it
358
+ metadata = getattr(state_dict, '_metadata', None)
359
+ state_dict = state_dict.copy()
360
+ if metadata is not None:
361
+ state_dict._metadata = metadata
362
+
363
+ def load(module, prefix=''):
364
+ local_metadata = {} if metadata is None else metadata.get(
365
+ prefix[:-1], {})
366
+ module._load_from_state_dict(
367
+ state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
368
+ for name, child in module._modules.items():
369
+ if child is not None:
370
+ load(child, prefix + name + '.')
371
+
372
+ load(model, prefix=prefix)
373
+
374
+ warn_missing_keys = []
375
+ ignore_missing_keys = []
376
+ for key in missing_keys:
377
+ keep_flag = True
378
+ for ignore_key in ignore_missing.split('|'):
379
+ if ignore_key in key:
380
+ keep_flag = False
381
+ break
382
+ if keep_flag:
383
+ warn_missing_keys.append(key)
384
+ else:
385
+ ignore_missing_keys.append(key)
386
+
387
+ missing_keys = warn_missing_keys
388
+
389
+ if len(missing_keys) > 0:
390
+ print("Weights of {} not initialized from pretrained model: {}".format(
391
+ model.__class__.__name__, missing_keys))
392
+ if len(unexpected_keys) > 0:
393
+ print("Weights from pretrained model not used in {}: {}".format(
394
+ model.__class__.__name__, unexpected_keys))
395
+ if len(ignore_missing_keys) > 0:
396
+ print("Ignored weights of {} not initialized from pretrained model: {}".format(
397
+ model.__class__.__name__, ignore_missing_keys))
398
+ if len(error_msgs) > 0:
399
+ print('\n'.join(error_msgs))
400
+
401
+
402
+ class NativeScalerWithGradNormCount:
403
+ state_dict_key = "amp_scaler"
404
+
405
+ def __init__(self):
406
+ self._scaler = torch.cuda.amp.GradScaler()
407
+
408
+ def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
409
+ self._scaler.scale(loss).backward(create_graph=create_graph)
410
+ if update_grad:
411
+ if clip_grad is not None:
412
+ assert parameters is not None
413
+ self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place
414
+ norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
415
+ else:
416
+ self._scaler.unscale_(optimizer)
417
+ norm = get_grad_norm_(parameters)
418
+ self._scaler.step(optimizer)
419
+ self._scaler.update()
420
+ else:
421
+ norm = None
422
+ return norm
423
+
424
+ def state_dict(self):
425
+ return self._scaler.state_dict()
426
+
427
+ def load_state_dict(self, state_dict):
428
+ self._scaler.load_state_dict(state_dict)
429
+
430
+
431
+ def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
432
+ if isinstance(parameters, torch.Tensor):
433
+ parameters = [parameters]
434
+ parameters = [p for p in parameters if p.grad is not None]
435
+ norm_type = float(norm_type)
436
+ if len(parameters) == 0:
437
+ return torch.tensor(0.)
438
+ device = parameters[0].grad.device
439
+ if norm_type == inf:
440
+ total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
441
+ else:
442
+ total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
443
+ return total_norm
444
+
445
+
446
+ def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0,
447
+ start_warmup_value=0, warmup_steps=-1):
448
+ warmup_schedule = np.array([])
449
+ warmup_iters = warmup_epochs * niter_per_ep
450
+ if warmup_steps > 0:
451
+ warmup_iters = warmup_steps
452
+ print("Set warmup steps = %d" % warmup_iters)
453
+ if warmup_epochs > 0:
454
+ warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
455
+
456
+ iters = np.arange(epochs * niter_per_ep - warmup_iters)
457
+ schedule = np.array(
458
+ [final_value + 0.5 * (base_value - final_value) * (1 + math.cos(math.pi * i / (len(iters)))) for i in iters])
459
+
460
+ schedule = np.concatenate((warmup_schedule, schedule))
461
+
462
+ assert len(schedule) == epochs * niter_per_ep
463
+ return schedule
464
+
465
+ def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, model_ema=None):
466
+ output_dir = Path(args.output_dir)
467
+ epoch_name = str(epoch)
468
+ checkpoint_paths = [output_dir / ('checkpoint-%s.pth' % epoch_name)]
469
+ for checkpoint_path in checkpoint_paths:
470
+ to_save = {
471
+ 'model': model_without_ddp.state_dict(),
472
+ 'optimizer': optimizer.state_dict(),
473
+ 'epoch': epoch,
474
+ 'scaler': loss_scaler.state_dict(),
475
+ 'args': args,
476
+ }
477
+
478
+ if model_ema is not None:
479
+ to_save['model_ema'] = get_state_dict(model_ema)
480
+
481
+ save_on_master(to_save, checkpoint_path)
482
+
483
+ if is_main_process() and isinstance(epoch, int):
484
+ to_del = epoch - args.save_ckpt_num * args.save_ckpt_freq
485
+ old_ckpt = output_dir / ('checkpoint-%s.pth' % to_del)
486
+ if os.path.exists(old_ckpt):
487
+ os.remove(old_ckpt)
488
+
489
+
490
+ def auto_load_model(args, model, model_without_ddp, optimizer, loss_scaler, model_ema=None):
491
+ output_dir = Path(args.output_dir)
492
+ if args.auto_resume and len(args.resume) == 0:
493
+ import glob
494
+ all_checkpoints = glob.glob(os.path.join(output_dir, 'checkpoint-*.pth'))
495
+ latest_ckpt = -1
496
+ for ckpt in all_checkpoints:
497
+ t = ckpt.split('-')[-1].split('.')[0]
498
+ if t.isdigit():
499
+ latest_ckpt = max(int(t), latest_ckpt)
500
+ if latest_ckpt >= 0:
501
+ args.resume = os.path.join(output_dir, 'checkpoint-%d.pth' % latest_ckpt)
502
+ print("Auto resume checkpoint: %s" % args.resume)
503
+
504
+ if args.resume:
505
+ if args.resume.startswith('https'):
506
+ checkpoint = torch.hub.load_state_dict_from_url(
507
+ args.resume, map_location='cpu', check_hash=True)
508
+ else:
509
+ checkpoint = torch.load(args.resume, map_location='cpu')
510
+ model_without_ddp.load_state_dict(checkpoint['model'])
511
+ print("Resume checkpoint %s" % args.resume)
512
+ if 'optimizer' in checkpoint and 'epoch' in checkpoint:
513
+ optimizer.load_state_dict(checkpoint['optimizer'])
514
+ if not isinstance(checkpoint['epoch'], str): # does not support resuming with 'best', 'best-ema'
515
+ args.start_epoch = checkpoint['epoch'] + 1
516
+ else:
517
+ assert args.eval, 'Does not support resuming with checkpoint-best'
518
+ if hasattr(args, 'model_ema') and args.model_ema:
519
+ if 'model_ema' in checkpoint.keys():
520
+ model_ema.ema.load_state_dict(checkpoint['model_ema'])
521
+ else:
522
+ model_ema.ema.load_state_dict(checkpoint['model'])
523
+ if 'scaler' in checkpoint:
524
+ loss_scaler.load_state_dict(checkpoint['scaler'])
525
+ print("With optim & sched!")
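A minimal usage sketch of the helpers above (an assumption: the file is importable as `utils`; the loop body is a stand-in for a real training step):

```
import time
import torch
import utils  # assumed module name for the file above

# Smoothed metric logging over a dummy "data loader".
logger = utils.MetricLogger(delimiter="  ")
dummy_loader = [torch.randn(4, 3, 32, 32) for _ in range(8)]
for batch in logger.log_every(dummy_loader, print_freq=4, header='Epoch: [0]'):
    logger.update(loss=batch.pow(2).mean())  # stand-in for a real loss
    time.sleep(0.01)
print("Averaged stats:", logger)

# Per-iteration learning-rate schedule: linear warmup followed by cosine decay.
lr_schedule = utils.cosine_scheduler(base_value=1e-4, final_value=1e-6,
                                     epochs=10, niter_per_ep=100, warmup_epochs=1)
print(len(lr_schedule), lr_schedule[0], lr_schedule[-1])  # 1000 values in total
```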
depth/utils_depth/criterion.py ADDED
@@ -0,0 +1,22 @@
1
+ # ------------------------------------------------------------------------------
2
+ # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth).
3
+ # For non-commercial purpose only (research, evaluation etc).
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ class SiLogLoss(nn.Module):
11
+ def __init__(self, lambd=0.5):
12
+ super().__init__()
13
+ self.lambd = lambd
14
+
15
+ def forward(self, pred, target):
16
+ valid_mask = (target > 0).detach()
17
+ diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask])
18
+ loss = torch.sqrt(torch.pow(diff_log, 2).mean() -
19
+ self.lambd * torch.pow(diff_log.mean(), 2))
20
+
21
+ return loss
22
+
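A quick sanity check of `SiLogLoss` on random positive depth maps (illustrative only; the import path assumes the repository root is on `PYTHONPATH`):

```
import torch
from depth.utils_depth.criterion import SiLogLoss

criterion = SiLogLoss(lambd=0.5)
pred = torch.rand(2, 1, 8, 8) + 0.1    # strictly positive predictions
target = torch.rand(2, 1, 8, 8) + 0.1  # pixels with target <= 0 are masked out
print(criterion(pred, target))         # scalar scale-invariant log loss
```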
depth/utils_depth/logging.py ADDED
@@ -0,0 +1,161 @@
1
+ # ------------------------------------------------------------------------------
2
+ # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth).
3
+ # For non-commercial purpose only (research, evaluation etc).
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import os
7
+ import cv2
8
+ import sys
9
+ import time
10
+ import numpy as np
11
+
12
+ import torch
13
+
14
+
15
+ TOTAL_BAR_LENGTH = 30.
16
+ last_time = time.time()
17
+ begin_time = last_time
18
+
19
+
20
+ def progress_bar(current, total, epochs, cur_epoch, msg=None):
21
+ _, term_width = os.popen('stty size', 'r').read().split()
22
+ term_width = int(term_width)
23
+ global last_time, begin_time
24
+ if current == 0:
25
+ begin_time = time.time() # Reset for new bar.
26
+
27
+ cur_len = int(TOTAL_BAR_LENGTH * current / total)
28
+ rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1
29
+
30
+ sys.stdout.write(' [')
31
+ for i in range(cur_len):
32
+ sys.stdout.write('=')
33
+ sys.stdout.write('>')
34
+ for i in range(rest_len):
35
+ sys.stdout.write('.')
36
+ sys.stdout.write(']')
37
+
38
+ cur_time = time.time()
39
+ step_time = cur_time - last_time
40
+ last_time = cur_time
41
+ tot_time = cur_time - begin_time
42
+ remain_time = step_time * (total - current) + \
43
+ (epochs - cur_epoch) * step_time * total
44
+
45
+ L = []
46
+ L.append(' Step: %s' % format_time(step_time))
47
+ L.append(' | Tot: %s' % format_time(tot_time))
48
+ L.append(' | Rem: %s' % format_time(remain_time))
49
+ if msg:
50
+ L.append(' | ' + msg)
51
+
52
+ msg = ''.join(L)
53
+ sys.stdout.write(msg)
54
+ for i in range(157 - int(TOTAL_BAR_LENGTH) - len(msg) - 3):
55
+ sys.stdout.write(' ')
56
+
57
+ # Go back to the center of the bar.
58
+ for i in range(157 - int(TOTAL_BAR_LENGTH / 2) + 2):
59
+ sys.stdout.write('\b')
60
+ sys.stdout.write(' %d/%d ' % (current + 1, total))
61
+
62
+ if current < total - 1:
63
+ sys.stdout.write('\r')
64
+ else:
65
+ sys.stdout.write('\n')
66
+ sys.stdout.flush()
67
+
68
+
69
+ class AverageMeter():
70
+ """Computes and stores the average and current value"""
71
+
72
+ def __init__(self):
73
+ self.reset()
74
+
75
+ def reset(self):
76
+ self.val = 0
77
+ self.avg = 0
78
+ self.sum = 0
79
+ self.count = 0
80
+
81
+ def update(self, val, n=1):
82
+ self.val = val
83
+ self.sum += val * n
84
+ self.count += n
85
+ self.avg = self.sum / self.count
86
+
87
+
88
+ def format_time(seconds):
89
+ days = int(seconds / 3600 / 24)
90
+ seconds = seconds - days * 3600 * 24
91
+ hours = int(seconds / 3600)
92
+ seconds = seconds - hours * 3600
93
+ minutes = int(seconds / 60)
94
+ seconds = seconds - minutes * 60
95
+ secondsf = int(seconds)
96
+ seconds = seconds - secondsf
97
+ millis = int(seconds * 1000)
98
+
99
+ f = ''
100
+ i = 1
101
+ if days > 0:
102
+ f += str(days) + 'D'
103
+ i += 1
104
+ if hours > 0 and i <= 2:
105
+ f += str(hours) + 'h'
106
+ i += 1
107
+ if minutes > 0 and i <= 2:
108
+ f += str(minutes).zfill(2) + 'm'
109
+ i += 1
110
+ if secondsf > 0 and i <= 2:
111
+ f += str(secondsf).zfill(2) + 's'
112
+ i += 1
113
+ if millis > 0 and i <= 2:
114
+ f += str(millis).zfill(3) + 'ms'
115
+ i += 1
116
+ if f == '':
117
+ f = '0ms'
118
+ return f
119
+
120
+
121
+ def display_result(result_dict):
122
+ line = "\n"
123
+ line += "=" * 100 + '\n'
124
+ for metric, value in result_dict.items():
125
+ line += "{:>10} ".format(metric)
126
+ line += "\n"
127
+ for metric, value in result_dict.items():
128
+ line += "{:10.4f} ".format(value)
129
+ line += "\n"
130
+ line += "=" * 100 + '\n'
131
+
132
+ return line
133
+
134
+
135
+ def save_images(pred, save_path):
136
+ if len(pred.shape) > 3:
137
+ pred = pred.squeeze()
138
+
139
+ if isinstance(pred, torch.Tensor):
140
+ pred = pred.cpu().numpy().astype(np.uint8)
141
+
142
+ if pred.shape[0] < 4:
143
+ pred = np.transpose(pred, (1, 2, 0))
144
+ cv2.imwrite(save_path, pred, [cv2.IMWRITE_PNG_COMPRESSION, 0])
145
+
146
+
147
+ def check_and_make_dirs(paths):
148
+ if not isinstance(paths, list):
149
+ paths = [paths]
150
+ for path in paths:
151
+ if not os.path.exists(path):
152
+ os.makedirs(path)
153
+
154
+ def log_args_to_txt(log_txt, args):
155
+ if not os.path.exists(log_txt):
156
+ with open(log_txt, 'w') as txtfile:
157
+ args_ = vars(args)
158
+ args_str = ''
159
+ for k, v in args_.items():
160
+ args_str = args_str + str(k) + ':' + str(v) + ',\t\n'
161
+ txtfile.write(args_str + '\n')
depth/utils_depth/metrics.py ADDED
@@ -0,0 +1,79 @@
1
+ # ------------------------------------------------------------------------------
2
+ # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth).
3
+ # For non-commercial purpose only (research, evaluation etc).
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import torch
7
+
8
+
9
+ def eval_depth(pred, target):
10
+ assert pred.shape == target.shape
11
+
12
+ thresh = torch.max((target / pred), (pred / target))
13
+
14
+ d1 = torch.sum(thresh < 1.25).float() / len(thresh)
15
+ d2 = torch.sum(thresh < 1.25 ** 2).float() / len(thresh)
16
+ d3 = torch.sum(thresh < 1.25 ** 3).float() / len(thresh)
17
+
18
+ diff = pred - target
19
+ diff_log = torch.log(pred) - torch.log(target)
20
+
21
+ abs_rel = torch.mean(torch.abs(diff) / target)
22
+ sq_rel = torch.mean(torch.pow(diff, 2) / target)
23
+
24
+ rmse = torch.sqrt(torch.mean(torch.pow(diff, 2)))
25
+
26
+ rmse_log = torch.sqrt(torch.mean(torch.pow(diff_log, 2)))
27
+
28
+ log10 = torch.mean(torch.abs(torch.log10(pred) - torch.log10(target)))
29
+ silog = torch.sqrt(torch.pow(diff_log, 2).mean() - 0.5 * torch.pow(diff_log.mean(), 2))
30
+
31
+ return {'d1': d1.item(), 'd2': d2.item(), 'd3': d3.item(), 'abs_rel': abs_rel.item(),
32
+ 'sq_rel': sq_rel.item(), 'rmse': rmse.item(), 'rmse_log': rmse_log.item(),
33
+ 'log10':log10.item(), 'silog':silog.item()}
34
+
35
+
36
+ def cropping_img(args, pred, gt_depth):
37
+ min_depth_eval = args.min_depth_eval
38
+
39
+ max_depth_eval = args.max_depth_eval
40
+
41
+ pred[torch.isinf(pred)] = max_depth_eval
42
+ pred[torch.isnan(pred)] = min_depth_eval
43
+
44
+ valid_mask = torch.logical_and(
45
+ gt_depth > min_depth_eval, gt_depth < max_depth_eval)
46
+
47
+ if args.dataset == 'kitti':
48
+ if args.do_kb_crop:
49
+ height, width = gt_depth.shape
50
+ top_margin = int(height - 352)
51
+ left_margin = int((width - 1216) / 2)
52
+ gt_depth = gt_depth[top_margin:top_margin +
53
+ 352, left_margin:left_margin + 1216]
54
+
55
+ if args.kitti_crop:
56
+ gt_height, gt_width = gt_depth.shape
57
+ eval_mask = torch.zeros(valid_mask.shape).to(
58
+ device=valid_mask.device)
59
+
60
+ if args.kitti_crop == 'garg_crop':
61
+ eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height),
62
+ int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1
63
+
64
+ elif args.kitti_crop == 'eigen_crop':
65
+ eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height),
66
+ int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
67
+ else:
68
+ eval_mask = valid_mask
69
+
70
+ elif args.dataset == 'nyudepthv2':
71
+ eval_mask = torch.zeros(valid_mask.shape).to(device=valid_mask.device)
72
+ eval_mask[45:471, 41:601] = 1
73
+ else:
74
+ eval_mask = valid_mask
75
+
76
+ valid_mask = torch.logical_and(valid_mask, eval_mask)
77
+
78
+ return pred[valid_mask], gt_depth[valid_mask]
79
+
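An illustrative call to `eval_depth` on the kind of flattened, strictly positive tensors returned by `cropping_img` (dummy data; the import path assumes the repository root is on `PYTHONPATH`):

```
import torch
from depth.utils_depth.metrics import eval_depth

gt = torch.rand(10000) * 9.5 + 0.5                            # depths in (0.5, 10.0)
pred = gt * (1.0 + 0.05 * torch.randn(10000)).clamp(min=0.1)  # noisy prediction
metrics = eval_depth(pred, gt)
print(metrics['d1'], metrics['abs_rel'], metrics['rmse'])
```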
depth/utils_depth/misc.py ADDED
@@ -0,0 +1,73 @@
1
+ # ------------------------------------------------------------------------------
2
+ # The code is from ZoeDepth (https://github.com/isl-org/ZoeDepth).
3
+ # For non-commercial purpose only (research, evaluation etc).
4
+ # ------------------------------------------------------------------------------
5
+ from scipy import ndimage
6
+
7
+ import math
8
+
9
+ import matplotlib
10
+ import matplotlib.cm
11
+ import numpy as np
12
+ import requests
13
+ import torch
14
+ from PIL import Image
15
+ from torchvision.transforms import ToTensor
16
+
17
+
18
+ def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None):
19
+ """Converts a depth map to a color image.
20
+
21
+ Args:
22
+ value (torch.Tensor, numpy.ndarray): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed
23
+ vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None.
24
+ vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None.
25
+ cmap (str, optional): matplotlib colormap to use. Defaults to 'gray_r'.
26
+ invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99.
27
+ invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None.
28
+ background_color (tuple[int], optional): 4-tuple RGBA color to give to invalid pixels. Defaults to (128, 128, 128, 255).
29
+ gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False.
30
+ value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None.
31
+
32
+ Returns:
33
+ numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4)
34
+ """
35
+ if isinstance(value, torch.Tensor):
36
+ value = value.detach().cpu().numpy()
37
+
38
+ value = value.squeeze()
39
+ if invalid_mask is None:
40
+ invalid_mask = value == invalid_val
41
+ mask = np.logical_not(invalid_mask)
42
+
43
+ # normalize
44
+ vmin = np.percentile(value[mask],2) if vmin is None else vmin
45
+ vmax = np.percentile(value[mask],85) if vmax is None else vmax
46
+ if vmin != vmax:
47
+ value = (value - vmin) / (vmax - vmin) # vmin..vmax
48
+ else:
49
+ # Avoid 0-division
50
+ value = value * 0.
51
+
52
+ # squeeze last dim if it exists
53
+ # grey out the invalid values
54
+
55
+ value[invalid_mask] = np.nan
56
+ cmapper = matplotlib.colormaps.get_cmap(cmap)
57
+ if value_transform:
58
+ value = value_transform(value)
59
+ # value = value / value.max()
60
+ value = cmapper(value, bytes=True) # (nxmx4)
61
+
62
+ # img = value[:, :, :]
63
+ img = value[...]
64
+ img[invalid_mask] = background_color
65
+
66
+ # return img.transpose((2, 0, 1))
67
+ if gamma_corrected:
68
+ # gamma correction
69
+ img = img / 255
70
+ img = np.power(img, 2.2)
71
+ img = img * 255
72
+ img = img.astype(np.uint8)
73
+ return img, vmin, vmax
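A minimal sketch of `colorize` on a random depth map (the colormap name is only an example; the import path assumes the repository root is on `PYTHONPATH`):

```
import numpy as np
from depth.utils_depth.misc import colorize

depth = np.random.rand(240, 320) * 10.0
colored, vmin, vmax = colorize(depth, cmap='magma_r')
print(colored.shape, colored.dtype)  # (240, 320, 4) uint8 RGBA image
```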
depth/v1-inference.yaml ADDED
@@ -0,0 +1,70 @@
1
+ model:
2
+ base_learning_rate: 1.0e-04
3
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.0120
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: "jpg"
11
+ cond_stage_key: "txt"
12
+ image_size: 64
13
+ channels: 4
14
+ cond_stage_trainable: false # Note: different from the one we trained before
15
+ conditioning_key: crossattn
16
+ monitor: val/loss_simple_ema
17
+ scale_factor: 0.18215
18
+ use_ema: False
19
+
20
+ scheduler_config: # 10000 warmup steps
21
+ target: ldm.lr_scheduler.LambdaLinearScheduler
22
+ params:
23
+ warm_up_steps: [ 10000 ]
24
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
25
+ f_start: [ 1.e-6 ]
26
+ f_max: [ 1. ]
27
+ f_min: [ 1. ]
28
+
29
+ unet_config:
30
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
31
+ params:
32
+ image_size: 32 # unused
33
+ in_channels: 4
34
+ out_channels: 4
35
+ model_channels: 320
36
+ attention_resolutions: [ 4, 2, 1 ]
37
+ num_res_blocks: 2
38
+ channel_mult: [ 1, 2, 4, 4 ]
39
+ num_heads: 8
40
+ use_spatial_transformer: True
41
+ transformer_depth: 1
42
+ context_dim: 768
43
+ use_checkpoint: True
44
+ legacy: False
45
+
46
+ first_stage_config:
47
+ target: ldm.models.autoencoder.AutoencoderKL
48
+ params:
49
+ embed_dim: 4
50
+ monitor: val/rec_loss
51
+ ddconfig:
52
+ double_z: true
53
+ z_channels: 4
54
+ resolution: 256
55
+ in_channels: 3
56
+ out_ch: 3
57
+ ch: 128
58
+ ch_mult:
59
+ - 1
60
+ - 2
61
+ - 4
62
+ - 4
63
+ num_res_blocks: 2
64
+ attn_resolutions: []
65
+ dropout: 0.0
66
+ lossconfig:
67
+ target: torch.nn.Identity
68
+
69
+ cond_stage_config:
70
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
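This config is consumed through `instantiate_from_config`, mirroring what the model classes added below do; a sketch (requires the Stable Diffusion `ldm` package, and the checkpoint path is an assumption):

```
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

config = OmegaConf.load('depth/v1-inference.yaml')
config.model.params.ckpt_path = 'checkpoints/v1-5-pruned-emaonly.ckpt'  # assumed location
sd_model = instantiate_from_config(config.model)

encoder_vq = sd_model.first_stage_model  # VAE used as the image encoder
unet = sd_model.model                    # UNet, later wrapped by UNetWrapper
```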
evp/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .models import UNetWrapper, TextAdapter
evp/models.py ADDED
@@ -0,0 +1,349 @@
1
+ from omegaconf import OmegaConf
2
+
3
+ import torch as th
4
+ import torch
5
+ import math
6
+ import abc
7
+
8
+ from torch import nn, einsum
9
+
10
+ from einops import rearrange, repeat
11
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
12
+ from transformers import CLIPTokenizer
13
+ from transformers.models.clip.modeling_clip import CLIPTextConfig, CLIPTextModel, CLIPTextTransformer#, _expand_mask
14
+ from inspect import isfunction
15
+
16
+
17
+ def exists(val):
18
+ return val is not None
19
+
20
+
21
+ def uniq(arr):
22
+ return {el: True for el in arr}.keys()
23
+
24
+
25
+ def default(val, d):
26
+ if exists(val):
27
+ return val
28
+ return d() if isfunction(d) else d
29
+
30
+
31
+
32
+ def register_attention_control(model, controller):
33
+ def ca_forward(self, place_in_unet):
34
+ def forward(x, context=None, mask=None):
35
+ h = self.heads
36
+
37
+ q = self.to_q(x)
38
+ is_cross = context is not None
39
+ context = default(context, x)
40
+ k = self.to_k(context)
41
+ v = self.to_v(context)
42
+
43
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
44
+
45
+ sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
46
+
47
+ if exists(mask):
48
+ mask = rearrange(mask, 'b ... -> b (...)')
49
+ max_neg_value = -torch.finfo(sim.dtype).max
50
+ mask = repeat(mask, 'b j -> (b h) () j', h=h)
51
+ sim.masked_fill_(~mask, max_neg_value)
52
+
53
+ # attention, what we cannot get enough of
54
+ attn = sim.softmax(dim=-1)
55
+
56
+ attn2 = rearrange(attn, '(b h) k c -> h b k c', h=h).mean(0)
57
+ controller(attn2, is_cross, place_in_unet)
58
+
59
+ out = einsum('b i j, b j d -> b i d', attn, v)
60
+ out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
61
+ return self.to_out(out)
62
+
63
+ return forward
64
+
65
+ class DummyController:
66
+ def __call__(self, *args):
67
+ return args[0]
68
+
69
+ def __init__(self):
70
+ self.num_att_layers = 0
71
+
72
+ if controller is None:
73
+ controller = DummyController()
74
+
75
+ def register_recr(net_, count, place_in_unet):
76
+ if net_.__class__.__name__ == 'CrossAttention':
77
+ net_.forward = ca_forward(net_, place_in_unet)
78
+ return count + 1
79
+ elif hasattr(net_, 'children'):
80
+ for net__ in net_.children():
81
+ count = register_recr(net__, count, place_in_unet)
82
+ return count
83
+
84
+ cross_att_count = 0
85
+ sub_nets = model.diffusion_model.named_children()
86
+
87
+ for net in sub_nets:
88
+ if "input_blocks" in net[0]:
89
+ cross_att_count += register_recr(net[1], 0, "down")
90
+ elif "output_blocks" in net[0]:
91
+ cross_att_count += register_recr(net[1], 0, "up")
92
+ elif "middle_block" in net[0]:
93
+ cross_att_count += register_recr(net[1], 0, "mid")
94
+
95
+ controller.num_att_layers = cross_att_count
96
+
97
+
98
+ class AttentionControl(abc.ABC):
99
+
100
+ def step_callback(self, x_t):
101
+ return x_t
102
+
103
+ def between_steps(self):
104
+ return
105
+
106
+ @property
107
+ def num_uncond_att_layers(self):
108
+ return 0
109
+
110
+ @abc.abstractmethod
111
+ def forward (self, attn, is_cross: bool, place_in_unet: str):
112
+ raise NotImplementedError
113
+
114
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
115
+ attn = self.forward(attn, is_cross, place_in_unet)
116
+ return attn
117
+
118
+ def reset(self):
119
+ self.cur_step = 0
120
+ self.cur_att_layer = 0
121
+
122
+ def __init__(self):
123
+ self.cur_step = 0
124
+ self.num_att_layers = -1
125
+ self.cur_att_layer = 0
126
+
127
+
128
+ class AttentionStore(AttentionControl):
129
+ @staticmethod
130
+ def get_empty_store():
131
+ return {"down_cross": [], "mid_cross": [], "up_cross": [],
132
+ "down_self": [], "mid_self": [], "up_self": []}
133
+
134
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
135
+ key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
136
+ if attn.shape[1] <= (self.max_size) ** 2: # avoid memory overhead
137
+ self.step_store[key].append(attn)
138
+ return attn
139
+
140
+ def between_steps(self):
141
+ if len(self.attention_store) == 0:
142
+ self.attention_store = self.step_store
143
+ else:
144
+ for key in self.attention_store:
145
+ for i in range(len(self.attention_store[key])):
146
+ self.attention_store[key][i] += self.step_store[key][i]
147
+ self.step_store = self.get_empty_store()
148
+
149
+ def get_average_attention(self):
150
+ average_attention = {key: [item for item in self.step_store[key]] for key in self.step_store}
151
+ return average_attention
152
+
153
+ def reset(self):
154
+ super(AttentionStore, self).reset()
155
+ self.step_store = self.get_empty_store()
156
+ self.attention_store = {}
157
+
158
+ def __init__(self, base_size=64, max_size=None):
159
+ super(AttentionStore, self).__init__()
160
+ self.step_store = self.get_empty_store()
161
+ self.attention_store = {}
162
+ self.base_size = base_size
163
+ if max_size is None:
164
+ self.max_size = self.base_size // 2
165
+ else:
166
+ self.max_size = max_size
167
+
168
+ def register_hier_output(model):
169
+ self = model.diffusion_model
170
+ from ldm.modules.diffusionmodules.util import checkpoint, timestep_embedding
171
+ def forward(x, timesteps=None, context=None, y=None,**kwargs):
172
+ """
173
+ Apply the model to an input batch.
174
+ :param x: an [N x C x ...] Tensor of inputs.
175
+ :param timesteps: a 1-D batch of timesteps.
176
+ :param context: conditioning plugged in via crossattn
177
+ :param y: an [N] Tensor of labels, if class-conditional.
178
+ :return: an [N x C x ...] Tensor of outputs.
179
+ """
180
+ assert (y is not None) == (
181
+ self.num_classes is not None
182
+ ), "must specify y if and only if the model is class-conditional"
183
+ hs = []
184
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
185
+ emb = self.time_embed(t_emb)
186
+
187
+ if self.num_classes is not None:
188
+ assert y.shape == (x.shape[0],)
189
+ emb = emb + self.label_emb(y)
190
+
191
+ h = x.type(self.dtype)
192
+ for module in self.input_blocks:
193
+ # import pdb; pdb.set_trace()
194
+ if context.shape[1]==2:
195
+ h = module(h, emb, context[:,0,:].unsqueeze(1))
196
+ else:
197
+ h = module(h, emb, context)
198
+ hs.append(h)
199
+ if context.shape[1]==2:
200
+ h = self.middle_block(h, emb, context[:,0,:].unsqueeze(1))
201
+ else:
202
+ h = self.middle_block(h, emb, context)
203
+ out_list = []
204
+
205
+ for i_out, module in enumerate(self.output_blocks):
206
+ h = th.cat([h, hs.pop()], dim=1)
207
+ if context.shape[1]==2:
208
+ h = module(h, emb, context[:,1,:].unsqueeze(1))
209
+ else:
210
+ h = module(h, emb, context)
211
+ if i_out in [1, 4, 7]:
212
+ out_list.append(h)
213
+ h = h.type(x.dtype)
214
+
215
+ out_list.append(h)
216
+ return out_list
217
+
218
+ self.forward = forward
219
+
220
+ class UNetWrapper(nn.Module):
221
+ def __init__(self, unet, use_attn=True, base_size=512, max_attn_size=None, attn_selector='up_cross+down_cross') -> None:
222
+ super().__init__()
223
+ self.unet = unet
224
+ self.attention_store = AttentionStore(base_size=base_size // 8, max_size=max_attn_size)
225
+ self.size16 = base_size // 32
226
+ self.size32 = base_size // 16
227
+ self.size64 = base_size // 8
228
+ self.use_attn = use_attn
229
+ if self.use_attn:
230
+ register_attention_control(unet, self.attention_store)
231
+ register_hier_output(unet)
232
+ self.attn_selector = attn_selector.split('+')
233
+
234
+ def forward(self, *args, **kwargs):
235
+ if self.use_attn:
236
+ self.attention_store.reset()
237
+ out_list = self.unet(*args, **kwargs)
238
+ if self.use_attn:
239
+ avg_attn = self.attention_store.get_average_attention()
240
+ attn16, attn32, attn64 = self.process_attn(avg_attn)
241
+ out_list[1] = torch.cat([out_list[1], attn16], dim=1)
242
+ out_list[2] = torch.cat([out_list[2], attn32], dim=1)
243
+ if attn64 is not None:
244
+ out_list[3] = torch.cat([out_list[3], attn64], dim=1)
245
+ return out_list[::-1]
246
+
247
+ def process_attn(self, avg_attn):
248
+ attns = {self.size16: [], self.size32: [], self.size64: []}
249
+ for k in self.attn_selector:
250
+ for up_attn in avg_attn[k]:
251
+ size = int(math.sqrt(up_attn.shape[1]))
252
+ attns[size].append(rearrange(up_attn, 'b (h w) c -> b c h w', h=size))
253
+ attn16 = torch.stack(attns[self.size16]).mean(0)
254
+ attn32 = torch.stack(attns[self.size32]).mean(0)
255
+ if len(attns[self.size64]) > 0:
256
+ attn64 = torch.stack(attns[self.size64]).mean(0)
257
+ else:
258
+ attn64 = None
259
+ return attn16, attn32, attn64
260
+
261
+ class TextAdapter(nn.Module):
262
+ def __init__(self, text_dim=768, hidden_dim=None):
263
+ super().__init__()
264
+ if hidden_dim is None:
265
+ hidden_dim = text_dim
266
+ self.fc = nn.Sequential(
267
+ nn.Linear(text_dim, hidden_dim),
268
+ nn.GELU(),
269
+ nn.Linear(hidden_dim, text_dim)
270
+ )
271
+
272
+ def forward(self, latents, texts, gamma):
273
+ n_class, channel = texts.shape
274
+ bs = latents.shape[0]
275
+
276
+ texts_after = self.fc(texts)
277
+ texts = texts + gamma * texts_after
278
+ texts = repeat(texts, 'n c -> b n c', b=bs)
279
+ return texts
280
+
281
+ class TextAdapterRefer(nn.Module):
282
+ def __init__(self, text_dim=768):
283
+ super().__init__()
284
+
285
+ self.fc = nn.Sequential(
286
+ nn.Linear(text_dim, text_dim),
287
+ nn.GELU(),
288
+ nn.Linear(text_dim, text_dim)
289
+ )
290
+
291
+ def forward(self, latents, texts, gamma):
292
+ texts_after = self.fc(texts)
293
+ texts = texts + gamma * texts_after
294
+ return texts
295
+
296
+
297
+ class TextAdapterDepth(nn.Module):
298
+ def __init__(self, text_dim=768):
299
+ super().__init__()
300
+
301
+ self.fc = nn.Sequential(
302
+ nn.Linear(text_dim, text_dim),
303
+ nn.GELU(),
304
+ nn.Linear(text_dim, text_dim)
305
+ )
306
+
307
+ def forward(self, latents, texts, gamma):
308
+ # use the gamma to blend
309
+ n_sen, channel = texts.shape
310
+ bs = latents.shape[0]
311
+
312
+ texts_after = self.fc(texts)
313
+ texts = texts + gamma * texts_after
314
+ texts = repeat(texts, 'n c -> n b c', b=1)
315
+ return texts
316
+
317
+
318
+ class FrozenCLIPEmbedder(nn.Module):
319
+ """Uses the CLIP transformer encoder for text (from Hugging Face)"""
320
+ def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77, pool=True):
321
+ super().__init__()
322
+ self.tokenizer = CLIPTokenizer.from_pretrained(version)
323
+ self.transformer = CLIPTextModel.from_pretrained(version)
324
+ self.device = device
325
+ self.max_length = max_length
326
+ self.freeze()
327
+
328
+ self.pool = pool
329
+
330
+ def freeze(self):
331
+ self.transformer = self.transformer.eval()
332
+ for param in self.parameters():
333
+ param.requires_grad = False
334
+
335
+ def forward(self, text):
336
+ batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
337
+ return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
338
+ tokens = batch_encoding["input_ids"].to(self.device)
339
+ outputs = self.transformer(input_ids=tokens)
340
+
341
+ if self.pool:
342
+ z = outputs.pooler_output
343
+ else:
344
+ z = outputs.last_hidden_state
345
+ return z
346
+
347
+ def encode(self, text):
348
+ return self(text)
349
+
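A standalone check of the `TextAdapter` blending on dummy tensors (requires the repo's dependencies such as `einops` and `transformers`; the shapes are illustrative):

```
import torch
from evp.models import TextAdapter

adapter = TextAdapter(text_dim=768)
latents = torch.randn(2, 4, 64, 64)  # only the batch size is read
texts = torch.randn(10, 768)         # e.g. 10 CLIP class embeddings
out = adapter(latents, texts, gamma=1e-4)
print(out.shape)                     # torch.Size([2, 10, 768])
```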
refer/README.md ADDED
@@ -0,0 +1,78 @@
1
+ # Referring Image Segmentation
2
+ ## Getting Started
3
+
4
+ 1. Install the required packages.
5
+
6
+ ```
7
+ pip install -r requirements.txt
8
+ ```
9
+
10
+ 2. Prepare RefCOCO datasets following [LAVT](https://github.com/yz93/LAVT-RIS).
11
+
12
+ * Download COCO 2014 Train Images [83K/13GB] from [COCO](https://cocodataset.org/#download), and extract `train2014.zip` to `./refer/data/images/mscoco/images`
13
+
14
+ * Follow the instructions in `./refer` to download and extract `refclef.zip, refcoco.zip, refcoco+.zip, refcocog.zip` to `./refer/data`
15
+
16
+ Your dataset directory should be:
17
+
18
+ ```
19
+ refer/
20
+ ├──data/
21
+ │ ├── images/mscoco/images/
22
+ │ ├── refclef
23
+ │ ├── refcoco
24
+ │ ├── refcoco+
25
+ │ ├── refcocog
26
+ ├──evaluation/
27
+ ├──...
28
+ ```
29
+
30
+ ## Results and Fine-tuned Models of EVP
31
+ EVP achieves 76.35 overall IoU and 77.61 mean IoU on the validation set of RefCOCO.
32
+
33
+ ## Training
34
+
35
+ We count the maximum length of the referring sentences and set the token length of the language model accordingly. The checkpoint of the best epoch will be saved at `./checkpoints/`.
36
+
37
+ * Train on RefCOCO
38
+
39
+ ```
40
+ bash train.sh refcoco /path/to/logdir <NUM_GPUS> --token_length 40
41
+ ```
42
+
43
+ * Train on RefCOCO+
44
+
45
+ ```
46
+ bash train.sh refcoco+ /path/to/logdir <NUM_GPUS> --token_length 40
47
+ ```
48
+
49
+ * Train on RefCOCOg
50
+
51
+ ```
52
+ bash train.sh refcocog /path/to/logdir <NUM_GPUS> --token_length 77 --splitBy umd
53
+ ```
54
+
55
+ ## Evaluation
56
+
57
+ * Evaluate on RefCOCO
58
+
59
+ ```
60
+ bash test.sh refcoco /path/to/evp_ris_refcoco.pth --token_length 40
61
+ ```
62
+
63
+ * Evaluate on RefCOCO+
64
+
65
+ ```
66
+ bash test.sh refcoco+ /path/to/evp_ris_refcoco+.pth --token_length 40
67
+ ```
68
+
69
+ * Evaluate on RefCOCOg
70
+
71
+ ```
72
+ bash test.sh refcocog /path/to/evp_ris_gref.pth --token_length 77 --splitBy umd
73
+ ```
74
+
75
+ ## Custom inference
76
+ ```
77
+ PYTHONPATH="../":$PYTHONPATH python inference.py --img_path test_img.jpg --resume refcoco.pth --token_length 40 --prompt 'green plant'
78
+ ```
refer/args.py ADDED
@@ -0,0 +1,42 @@
1
+ import argparse
2
+
3
+
4
+ def get_parser():
5
+ parser = argparse.ArgumentParser(description='EVP training and testing')
6
+ parser.add_argument('--amsgrad', action='store_true',
7
+ help='if true, set amsgrad to True in an Adam or AdamW optimizer.')
8
+ parser.add_argument('-b', '--batch-size', default=8, type=int)
9
+ parser.add_argument('--ck_bert', default='bert-base-uncased', help='pre-trained BERT weights')
10
+ parser.add_argument('--dataset', default='refcoco', help='refcoco, refcoco+, or refcocog')
11
+ parser.add_argument('--ddp_trained_weights', action='store_true',
12
+ help='Only needs specified when testing,'
13
+ 'whether the weights to be loaded are from a DDP-trained model')
14
+ parser.add_argument('--device', default='cuda:0', help='device') # only used when testing on a single machine
15
+ parser.add_argument('--epochs', default=40, type=int, metavar='N', help='number of total epochs to run')
16
+ parser.add_argument('--fusion_drop', default=0.0, type=float, help='dropout rate for PWAMs')
17
+ parser.add_argument('--img_size', default=480, type=int, help='input image size')
18
+ parser.add_argument("--local_rank", type=int, default=0, help='local rank for DistributedDataParallel')
19
+ parser.add_argument("--local-rank", type=int, default=0, help='local rank for DistributedDataParallel')
20
+ parser.add_argument('--lr', default=0.00005, type=float, help='the initial learning rate')
21
+ parser.add_argument('--model_id', default='evp', help='name to identify the model')
22
+ parser.add_argument('--output-dir', default='./checkpoints/', help='path where to save checkpoint weights')
23
+ parser.add_argument('--pin_mem', action='store_true',
24
+ help='If true, pin memory when using the data loader.')
25
+ parser.add_argument('--pretrained_swin_weights', default='',
26
+ help='path to pre-trained Swin backbone weights')
27
+ parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
28
+ parser.add_argument('--refer_data_root', default='./refer/data/', help='REFER dataset root directory')
29
+ parser.add_argument('--resume', default='', help='resume from checkpoint')
30
+ parser.add_argument('--split', default='val')
31
+ parser.add_argument('--splitBy', default='unc')
32
+ parser.add_argument('--wd', '--weight-decay', default=1e-2, type=float, metavar='W', help='weight decay',
33
+ dest='weight_decay')
34
+ parser.add_argument('-j', '--workers', default=8, type=int, metavar='N', help='number of data loading workers')
35
+ parser.add_argument('--token_length', default=77, type=int)
36
+
37
+ return parser
38
+
39
+
40
+ if __name__ == "__main__":
41
+ parser = get_parser()
42
+ args_dict = parser.parse_args()
refer/inference.py ADDED
@@ -0,0 +1,60 @@
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import torch
5
+ import torch.backends.cudnn as cudnn
6
+ from models_refer.model import EVPRefer
7
+ from args import get_parser
8
+ import glob
9
+ import utils
10
+ import torchvision.transforms as transforms
11
+ from PIL import Image
12
+ import torch.nn.functional as F
13
+ from collections import OrderedDict  # needed when stripping the 'module.' prefix below
+ from transformers import CLIPTokenizer
14
+
15
+
16
+ def main():
17
+ parser = get_parser()
18
+ parser.add_argument('--img_path', type=str)
19
+ parser.add_argument('--prompt', type=str)
20
+ args = parser.parse_args()
21
+
22
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
24
+ model = EVPRefer(sd_path='../checkpoints/v1-5-pruned-emaonly.ckpt')
25
+ cudnn.benchmark = True
26
+ model.to(device)
27
+ model_weight = torch.load(args.resume)['model']
28
+ if 'module' in next(iter(model_weight.items()))[0]:
29
+ model_weight = OrderedDict((k[7:], v) for k, v in model_weight.items())
30
+ model.load_state_dict(model_weight, strict=False)
31
+ model.eval()
32
+
33
+ img_path = args.img_path
34
+
35
+ image = cv2.imread(img_path)
36
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
37
+ image_t = transforms.ToTensor()(image).unsqueeze(0).to(device)
38
+ image_t = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(image_t)
39
+ shape = image_t.shape
40
+ image_t = torch.nn.functional.interpolate(image_t, (512,512), mode='bilinear', align_corners=True)
41
+ input_ids = tokenizer(text=args.prompt, truncation=True, max_length=args.token_length, return_length=True,
42
+ return_overflowing_tokens=False, padding="max_length", return_tensors="pt")['input_ids'].to(device)
43
+
44
+ with torch.no_grad():
45
+ pred = model(image_t, input_ids)
46
+
47
+ pred = torch.nn.functional.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
48
+ output_mask = pred.cpu().argmax(1).data.numpy().squeeze()
49
+
50
+ alpha = 0.65
51
+ image[output_mask == 0] = (image[output_mask == 0]*alpha).astype(np.uint8)
52
+ contours, _ = cv2.findContours(output_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
53
+ cv2.drawContours(image, contours, -1, (0, 255, 0), 2)
54
+
55
+ Image.fromarray(image.astype(np.uint8)).save('res.png')
56
+
57
+ return 0
58
+
59
+ if __name__ == '__main__':
60
+ main()
refer/models_refer/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .model import EVPRefer
refer/models_refer/model.py ADDED
@@ -0,0 +1,301 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import sys
6
+ from ldm.util import instantiate_from_config
7
+ from transformers.models.clip.modeling_clip import CLIPTextModel
8
+ from omegaconf import OmegaConf
9
+ from lib.mask_predictor import SimpleDecoding
10
+
11
+ from evp.models import UNetWrapper, TextAdapterRefer
12
+
13
+
14
+ def icnr(x, scale=2, init=nn.init.kaiming_normal_):
15
+ """
16
+ Checkerboard artifact free sub-pixel convolution
17
+ https://arxiv.org/abs/1707.02937
18
+ """
19
+ ni,nf,h,w = x.shape
20
+ ni2 = int(ni/(scale**2))
21
+ k = init(torch.zeros([ni2,nf,h,w])).transpose(0, 1)
22
+ k = k.contiguous().view(ni2, nf, -1)
23
+ k = k.repeat(1, 1, scale**2)
24
+ k = k.contiguous().view([nf,ni,h,w]).transpose(0, 1)
25
+ x.data.copy_(k)
26
+
27
+
28
+ class PixelShuffle(nn.Module):
29
+ """
30
+ Real-Time Single Image and Video Super-Resolution
31
+ https://arxiv.org/abs/1609.05158
32
+ """
33
+ def __init__(self, n_channels, scale):
34
+ super(PixelShuffle, self).__init__()
35
+ self.conv = nn.Conv2d(n_channels, n_channels*(scale**2), kernel_size=1)
36
+ icnr(self.conv.weight)
37
+ self.shuf = nn.PixelShuffle(scale)
38
+ self.relu = nn.ReLU()
39
+
40
+ def forward(self,x):
41
+ x = self.shuf(self.relu(self.conv(x)))
42
+ return x
43
+
44
+
45
+ class AttentionModule(nn.Module):
46
+ def __init__(self, in_channels, out_channels):
47
+ super(AttentionModule, self).__init__()
48
+
49
+ # Convolutional Layers
50
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
51
+
52
+ # Group Normalization
53
+ self.group_norm = nn.GroupNorm(20, out_channels)
54
+
55
+ # ReLU Activation
56
+ self.relu = nn.ReLU()
57
+
58
+ # Spatial Attention
59
+ self.spatial_attention = nn.Sequential(
60
+ nn.Conv2d(in_channels, 1, kernel_size=1),
61
+ nn.Sigmoid()
62
+ )
63
+
64
+ def forward(self, x):
65
+ # Apply spatial attention
66
+ spatial_attention = self.spatial_attention(x)
67
+ x = x * spatial_attention
68
+
69
+ # Apply convolutional layer
70
+ x = self.conv1(x)
71
+ x = self.group_norm(x)
72
+ x = self.relu(x)
73
+
74
+ return x
75
+
76
+
77
+ class AttentionDownsamplingModule(nn.Module):
78
+ def __init__(self, in_channels, out_channels, scale_factor=2):
79
+ super(AttentionDownsamplingModule, self).__init__()
80
+
81
+ # Spatial Attention
82
+ self.spatial_attention = nn.Sequential(
83
+ nn.Conv2d(in_channels, 1, kernel_size=1),
84
+ nn.Sigmoid()
85
+ )
86
+
87
+ # Channel Attention
88
+ self.channel_attention = nn.Sequential(
89
+ nn.AdaptiveAvgPool2d(1),
90
+ nn.Conv2d(in_channels, in_channels // 8, kernel_size=1),
91
+ nn.ReLU(inplace=True),
92
+ nn.Conv2d(in_channels // 8, in_channels, kernel_size=1),
93
+ nn.Sigmoid()
94
+ )
95
+
96
+ # Convolutional Layers
97
+ if scale_factor == 2:
98
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
99
+ elif scale_factor == 4:
100
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
101
+
102
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
103
+
104
+ # Group Normalization
105
+ self.group_norm = nn.GroupNorm(20, out_channels)
106
+
107
+ # ReLU Activation
108
+ self.relu = nn.ReLU(inplace=True)
109
+
110
+ def forward(self, x):
111
+ # Apply spatial attention
112
+ spatial_attention = self.spatial_attention(x)
113
+ x = x * spatial_attention
114
+
115
+ # Apply channel attention
116
+ channel_attention = self.channel_attention(x)
117
+ x = x * channel_attention
118
+
119
+ # Apply convolutional layers
120
+ x = self.conv1(x)
121
+ x = self.group_norm(x)
122
+ x = self.relu(x)
123
+ x = self.conv2(x)
124
+ x = self.group_norm(x)
125
+ x = self.relu(x)
126
+
127
+ return x
128
+
129
+
130
+ class AttentionUpsamplingModule(nn.Module):
131
+ def __init__(self, in_channels, out_channels):
132
+ super(AttentionUpsamplingModule, self).__init__()
133
+
134
+ # Spatial Attention for outs[2]
135
+ self.spatial_attention = nn.Sequential(
136
+ nn.Conv2d(in_channels, 1, kernel_size=1),
137
+ nn.Sigmoid()
138
+ )
139
+
140
+ # Channel Attention for outs[2]
141
+ self.channel_attention = nn.Sequential(
142
+ nn.AdaptiveAvgPool2d(1),
143
+ nn.Conv2d(in_channels, in_channels // 8, kernel_size=1),
144
+ nn.ReLU(),
145
+ nn.Conv2d(in_channels // 8, in_channels, kernel_size=1),
146
+ nn.Sigmoid()
147
+ )
148
+
149
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
150
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
151
+
152
+ # Group Normalization
153
+ self.group_norm = nn.GroupNorm(20, out_channels)
154
+
155
+ # ReLU Activation
156
+ self.relu = nn.ReLU()
157
+ self.upscale = PixelShuffle(in_channels, 2)
158
+
159
+ def forward(self, x):
160
+ # Apply spatial attention
161
+ spatial_attention = self.spatial_attention(x)
162
+ x = x * spatial_attention
163
+
164
+ # Apply channel attention
165
+ channel_attention = self.channel_attention(x)
166
+ x = x * channel_attention
167
+
168
+ # Apply convolutional layers
169
+ x = self.conv1(x)
170
+ x = self.group_norm(x)
171
+ x = self.relu(x)
172
+ x = self.conv2(x)
173
+ x = self.group_norm(x)
174
+ x = self.relu(x)
175
+
176
+ # Upsample
177
+ x = self.upscale(x)
178
+
179
+ return x
180
+
181
+
182
+ class ConvLayer(nn.Module):
183
+ def __init__(self, in_channels, out_channels):
184
+ super(ConvLayer, self).__init__()
185
+
186
+ self.conv1 = nn.Sequential(
187
+ nn.Conv2d(in_channels, out_channels, 1),
188
+ nn.GroupNorm(20, out_channels),
189
+ nn.ReLU(),
190
+ )
191
+
192
+ def forward(self, x):
193
+ x = self.conv1(x)
194
+
195
+ return x
196
+
197
+
198
+ class InverseMultiAttentiveFeatureRefinement(nn.Module):
199
+ def __init__(self, in_channels_list):
200
+ super(InverseMultiAttentiveFeatureRefinement, self).__init__()
201
+
202
+ self.layer1 = AttentionModule(in_channels_list[0], in_channels_list[0])
203
+ self.layer2 = AttentionDownsamplingModule(in_channels_list[0], in_channels_list[0]//2, scale_factor = 2)
204
+ self.layer3 = ConvLayer(in_channels_list[0]//2 + in_channels_list[1], in_channels_list[1])
205
+ self.layer4 = AttentionDownsamplingModule(in_channels_list[1], in_channels_list[1]//2, scale_factor = 2)
206
+ self.layer5 = ConvLayer(in_channels_list[1]//2 + in_channels_list[2], in_channels_list[2])
207
+ self.layer6 = AttentionDownsamplingModule(in_channels_list[2], in_channels_list[2]//2, scale_factor = 2)
208
+ self.layer7 = ConvLayer(in_channels_list[2]//2 + in_channels_list[3], in_channels_list[3])
209
+
210
+ '''
211
+ self.layer8 = AttentionUpsamplingModule(in_channels_list[3], in_channels_list[3])
212
+ self.layer9 = ConvLayer(in_channels_list[2] + in_channels_list[3], in_channels_list[2])
213
+ self.layer10 = AttentionUpsamplingModule(in_channels_list[2], in_channels_list[2])
214
+ self.layer11 = ConvLayer(in_channels_list[1] + in_channels_list[2], in_channels_list[1])
215
+ self.layer12 = AttentionUpsamplingModule(in_channels_list[1], in_channels_list[1])
216
+ self.layer13 = ConvLayer(in_channels_list[0] + in_channels_list[1], in_channels_list[0])
217
+ '''
218
+ def forward(self, inputs):
219
+ x_c4, x_c3, x_c2, x_c1 = inputs
220
+ x_c4 = self.layer1(x_c4)
221
+ x_c4_3 = self.layer2(x_c4)
222
+ x_c3 = torch.cat([x_c4_3, x_c3], dim=1)
223
+ x_c3 = self.layer3(x_c3)
224
+ x_c3_2 = self.layer4(x_c3)
225
+ x_c2 = torch.cat([x_c3_2, x_c2], dim=1)
226
+ x_c2 = self.layer5(x_c2)
227
+ x_c2_1 = self.layer6(x_c2)
228
+ x_c1 = torch.cat([x_c2_1, x_c1], dim=1)
229
+ x_c1 = self.layer7(x_c1)
230
+ '''
231
+ x_c1_2 = self.layer8(x_c1)
232
+ x_c2 = torch.cat([x_c1_2, x_c2], dim=1)
233
+ x_c2 = self.layer9(x_c2)
234
+ x_c2_3 = self.layer10(x_c2)
235
+ x_c3 = torch.cat([x_c2_3, x_c3], dim=1)
236
+ x_c3 = self.layer11(x_c3)
237
+ x_c3_4 = self.layer12(x_c3)
238
+ x_c4 = torch.cat([x_c3_4, x_c4], dim=1)
239
+ x_c4 = self.layer13(x_c4)
240
+ '''
241
+ return [x_c4, x_c3, x_c2, x_c1]
242
+
243
+
244
+
245
+ class EVPRefer(nn.Module):
246
+ """Encoder Decoder segmentors.
247
+
248
+ EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
249
+ Note that auxiliary_head is only used for deep supervision during training,
250
+ which could be dumped during inference.
251
+ """
252
+
253
+ def __init__(self,
254
+ sd_path=None,
255
+ base_size=512,
256
+ token_embed_dim=768,
257
+ neck_dim=[320,680,1320,1280],
258
+ **args):
259
+ super().__init__()
260
+ config = OmegaConf.load('./v1-inference.yaml')
261
+ config.model.params.ckpt_path = f'{sd_path}'
262
+ sd_model = instantiate_from_config(config.model)
263
+ self.encoder_vq = sd_model.first_stage_model
264
+ self.unet = UNetWrapper(sd_model.model, base_size=base_size)
265
+ del sd_model.cond_stage_model
266
+ del self.encoder_vq.decoder
267
+ for param in self.encoder_vq.parameters():
268
+ param.requires_grad = True
269
+
270
+ self.text_adapter = TextAdapterRefer(text_dim=token_embed_dim)
271
+
272
+ self.classifier = SimpleDecoding(dims=neck_dim)
273
+
274
+ self.gamma = nn.Parameter(torch.ones(token_embed_dim) * 1e-4)
275
+ self.aggregation = InverseMultiAttentiveFeatureRefinement([320,680,1320,1280])
276
+ self.clip_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
277
+ for param in self.clip_model.parameters():
278
+ param.requires_grad = True
279
+
280
+
281
+ def forward(self, img, sentences):
282
+ input_shape = img.shape[-2:]
283
+
284
+ latents = self.encoder_vq.encode(img).mode()
285
+ latents = latents / 4.7164
286
+
287
+ l_feats = self.clip_model(input_ids=sentences).last_hidden_state
288
+ c_crossattn = self.text_adapter(latents, l_feats, self.gamma) # NOTE: here the c_crossattn should be expand_dim as latents
289
+ t = torch.ones((img.shape[0],), device=img.device).long()
290
+ outs = self.unet(latents, t, c_crossattn=[c_crossattn])
291
+
292
+ outs = self.aggregation(outs)
293
+
294
+ x_c1, x_c2, x_c3, x_c4 = outs
295
+ x = self.classifier(x_c4, x_c3, x_c2, x_c1)
296
+ x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=True)
297
+
298
+ return x
299
+
300
+ def get_latent(self, x):
301
+ return self.encoder_vq.encode(x).mode()
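
`EVPRefer` wires the Stable Diffusion VAE encoder, a CLIP text encoder, the wrapped UNet feature extractor, the attentive feature aggregation and a simple decoder into one module. The following is a minimal usage sketch, not part of this commit: the import path `model`, the checkpoint filename, the 77-token CLIP tokenization and the 512x512 input are all illustrative assumptions.

```python
# Usage sketch only; module path, checkpoint and shapes are assumptions.
import torch
from transformers import CLIPTokenizer
from model import EVPRefer  # hypothetical import path for the class defined above

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
model = EVPRefer(sd_path='./stable-diffusion-v1.ckpt').eval()  # hypothetical SD v1 checkpoint path

img = torch.randn(1, 3, 512, 512)  # stand-in for a normalized RGB batch
tokens = tokenizer(["the dog on the left"],
                   padding="max_length", max_length=77,
                   truncation=True, return_tensors="pt").input_ids

with torch.no_grad():
    logits = model(img, tokens)  # segmentation logits upsampled to the input resolution
print(logits.shape)
```
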
refer/requirements.txt ADDED
@@ -0,0 +1,12 @@
+ requests
+ filelock
+ tqdm
+ timm
+ ftfy
+ regex
+ scipy
+ scikit-image
+ pycocotools==2.0.2
+ opencv-python==4.5.3.56
+ tokenizers
+ h5py
refer/transforms.py ADDED
@@ -0,0 +1,126 @@
+ import numpy as np
+ from PIL import Image
+ import random
+
+ import torch
+ from torchvision import transforms as T
+ from torchvision.transforms import functional as F
+
+ import warnings
+ warnings.filterwarnings("ignore")
+
+
+ def pad_if_smaller(img, size, fill=0):
+     min_size = min(img.size)
+     if min_size < size:
+         ow, oh = img.size
+         padh = size - oh if oh < size else 0
+         padw = size - ow if ow < size else 0
+         img = F.pad(img, (0, 0, padw, padh), fill=fill)
+     return img
+
+
+ class Compose(object):
+     def __init__(self, transforms):
+         self.transforms = transforms
+
+     def __call__(self, image, target):
+         for t in self.transforms:
+             image, target = t(image, target)
+         return image, target
+
+
+ class Resize(object):
+     def __init__(self, h, w):
+         self.h = h
+         self.w = w
+
+     def __call__(self, image, target):
+         image = F.resize(image, (self.h, self.w))
+         # If size is a sequence like (h, w), the output size will be matched to this.
+         # If size is an int, the smaller edge of the image will be matched to this number, maintaining the aspect ratio.
+         target = F.resize(target, (self.h, self.w))
+         return image, target
+
+
+ class RandomResize(object):
+     def __init__(self, min_size, max_size=None):
+         self.min_size = min_size
+         if max_size is None:
+             max_size = min_size
+         self.max_size = max_size
+
+     def __call__(self, image, target):
+         size = random.randint(self.min_size, self.max_size)  # random integer N such that min_size <= N <= max_size
+         image = F.resize(image, size)
+         # If size is a sequence like (h, w), the output size will be matched to this.
+         # If size is an int, the smaller edge of the image will be matched to this number, maintaining the aspect ratio.
+         target = F.resize(target, size)
+         return image, target
+
+
+ class RandomHorizontalFlip(object):
+     def __init__(self, flip_prob):
+         self.flip_prob = flip_prob
+
+     def __call__(self, image, target):
+         if random.random() < self.flip_prob:
+             image = F.hflip(image)
+             target = F.hflip(target)
+         return image, target
+
+
+ class RandomCrop(object):
+     def __init__(self, size):
+         self.size = size
+
+     def __call__(self, image, target):
+         image = pad_if_smaller(image, self.size)
+         target = pad_if_smaller(target, self.size, fill=255)
+         crop_params = T.RandomCrop.get_params(image, (self.size, self.size))
+         image = F.crop(image, *crop_params)
+         target = F.crop(target, *crop_params)
+         return image, target
+
+
+ class CenterCrop(object):
+     def __init__(self, size):
+         self.size = size
+
+     def __call__(self, image, target):
+         image = F.center_crop(image, self.size)
+         target = F.center_crop(target, self.size)
+         return image, target
+
+
+ class ToTensor(object):
+     def __call__(self, image, target):
+         image = F.to_tensor(image)
+         target = torch.as_tensor(np.asarray(target).copy(), dtype=torch.int64)
+         return image, target
+
+
+ class RandomAffine(object):
+     def __init__(self, angle, translate, scale, shear, resample=0, fillcolor=None):
+         self.angle = angle
+         self.translate = translate
+         self.scale = scale
+         self.shear = shear
+         self.resample = resample
+         self.fillcolor = fillcolor
+
+     def __call__(self, image, target):
+         affine_params = T.RandomAffine.get_params(self.angle, self.translate, self.scale, self.shear, image.size)
+         image = F.affine(image, *affine_params)
+         target = F.affine(target, *affine_params)
+         return image, target
+
+
+ class Normalize(object):
+     def __init__(self, mean, std):
+         self.mean = mean
+         self.std = std
+
+     def __call__(self, image, target):
+         image = F.normalize(image, mean=self.mean, std=self.std)
+         return image, target
+
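
These joint transforms apply the same spatial operation to the image and its referring mask so the two stay aligned. A minimal composition sketch follows; the 480x480 size and ImageNet mean/std are illustrative choices, and the `transforms` import path is an assumption, none of it prescribed by this commit.

```python
# Sketch of composing the joint transforms above for a (PIL image, PIL mask) pair.
from PIL import Image
import transforms as T  # hypothetical import path for refer/transforms.py

train_tf = T.Compose([
    T.Resize(480, 480),
    T.RandomHorizontalFlip(0.5),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

image = Image.new("RGB", (640, 427))   # stand-in for a dataset image
mask = Image.new("L", (640, 427))      # stand-in for the referring mask
img_t, mask_t = train_tf(image, mask)  # image: float tensor, mask: int64 tensor
```
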
refer/utils.py ADDED
@@ -0,0 +1,222 @@
+ from __future__ import print_function
+ from collections import defaultdict, deque
+ import datetime
+ import math
+ import time
+ import torch
+ import torch.distributed as dist
+ import torch.backends.cudnn as cudnn
+
+ import errno
+ import os
+
+ import sys
+
+
+ class SmoothedValue(object):
+     """Track a series of values and provide access to smoothed values over a
+     window or the global series average.
+     """
+
+     def __init__(self, window_size=20, fmt=None):
+         if fmt is None:
+             fmt = "{median:.4f} ({global_avg:.4f})"
+         self.deque = deque(maxlen=window_size)
+         self.total = 0.0
+         self.count = 0
+         self.fmt = fmt
+
+     def update(self, value, n=1):
+         self.deque.append(value)
+         self.count += n
+         self.total += value * n
+
+     def synchronize_between_processes(self):
+         """
+         Warning: does not synchronize the deque!
+         """
+         if not is_dist_avail_and_initialized():
+             return
+         t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+         dist.barrier()
+         dist.all_reduce(t)
+         t = t.tolist()
+         self.count = int(t[0])
+         self.total = t[1]
+
+     @property
+     def median(self):
+         d = torch.tensor(list(self.deque))
+         return d.median().item()
+
+     @property
+     def avg(self):
+         d = torch.tensor(list(self.deque), dtype=torch.float32)
+         return d.mean().item()
+
+     @property
+     def global_avg(self):
+         return self.total / self.count
+
+     @property
+     def max(self):
+         return max(self.deque)
+
+     @property
+     def value(self):
+         return self.deque[-1]
+
+     def __str__(self):
+         return self.fmt.format(
+             median=self.median,
+             avg=self.avg,
+             global_avg=self.global_avg,
+             max=self.max,
+             value=self.value)
+
+
+ class MetricLogger(object):
+     def __init__(self, delimiter="\t"):
+         self.meters = defaultdict(SmoothedValue)
+         self.delimiter = delimiter
+
+     def update(self, **kwargs):
+         for k, v in kwargs.items():
+             if isinstance(v, torch.Tensor):
+                 v = v.item()
+             assert isinstance(v, (float, int))
+             self.meters[k].update(v)
+
+     def __getattr__(self, attr):
+         if attr in self.meters:
+             return self.meters[attr]
+         if attr in self.__dict__:
+             return self.__dict__[attr]
+         raise AttributeError("'{}' object has no attribute '{}'".format(
+             type(self).__name__, attr))
+
+     def __str__(self):
+         loss_str = []
+         for name, meter in self.meters.items():
+             loss_str.append(
+                 "{}: {}".format(name, str(meter))
+             )
+         return self.delimiter.join(loss_str)
+
+     def synchronize_between_processes(self):
+         for meter in self.meters.values():
+             meter.synchronize_between_processes()
+
+     def add_meter(self, name, meter):
+         self.meters[name] = meter
+
+     def log_every(self, iterable, print_freq, header=None):
+         i = 0
+         if not header:
+             header = ''
+         start_time = time.time()
+         end = time.time()
+         iter_time = SmoothedValue(fmt='{avg:.4f}')
+         data_time = SmoothedValue(fmt='{avg:.4f}')
+         space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
+         log_msg = self.delimiter.join([
+             header,
+             '[{0' + space_fmt + '}/{1}]',
+             'eta: {eta}',
+             '{meters}',
+             'time: {time}',
+             'data: {data}',
+             'max mem: {memory:.0f}'
+         ])
+         MB = 1024.0 * 1024.0
+         for obj in iterable:
+             data_time.update(time.time() - end)
+             yield obj
+             iter_time.update(time.time() - end)
+             if i % print_freq == 0:
+                 eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                 eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                 print(log_msg.format(
+                     i, len(iterable), eta=eta_string,
+                     meters=str(self),
+                     time=str(iter_time), data=str(data_time),
+                     memory=torch.cuda.max_memory_allocated() / MB))
+                 sys.stdout.flush()
+
+             i += 1
+             end = time.time()
+         total_time = time.time() - start_time
+         total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+         print('{} Total time: {}'.format(header, total_time_str))
+
+
+ def mkdir(path):
+     try:
+         os.makedirs(path)
+     except OSError as e:
+         if e.errno != errno.EEXIST:
+             raise
+
+
+ def setup_for_distributed(is_master):
+     """
+     This function disables printing when not in the master process.
+     """
+     import builtins as __builtin__
+     builtin_print = __builtin__.print
+
+     def print(*args, **kwargs):
+         force = kwargs.pop('force', False)
+         if is_master or force:
+             builtin_print(*args, **kwargs)
+
+     __builtin__.print = print
+
+
+ def is_dist_avail_and_initialized():
+     if not dist.is_available():
+         return False
+     if not dist.is_initialized():
+         return False
+     return True
+
+
+ def get_world_size():
+     if not is_dist_avail_and_initialized():
+         return 1
+     return dist.get_world_size()
+
+
+ def get_rank():
+     if not is_dist_avail_and_initialized():
+         return 0
+     return dist.get_rank()
+
+
+ def is_main_process():
+     return get_rank() == 0
+
+
+ def save_on_master(*args, **kwargs):
+     if is_main_process():
+         torch.save(*args, **kwargs)
+
+
+ def init_distributed_mode(args):
+     if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+         rank = int(os.environ["RANK"])
+         world_size = int(os.environ['WORLD_SIZE'])
+         print(f"RANK and WORLD_SIZE in environment: {rank}/{world_size}")
+     else:
+         rank = -1
+         world_size = -1
+
+     torch.cuda.set_device(args.local_rank)
+     torch.distributed.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
+     torch.distributed.barrier()
+     setup_for_distributed(is_main_process())
+
+     if args.output_dir:
+         mkdir(args.output_dir)
+     if args.model_id:
+         mkdir(os.path.join('./models/', args.model_id))
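
`SmoothedValue` and `MetricLogger` follow the familiar torchvision reference-training pattern: `log_every` wraps an iterable and prints smoothed timing and metric values every `print_freq` steps. A minimal sketch of that pattern follows; the `utils` import path, the dummy loader and the loss/lr values are placeholders, and since `log_every` reports CUDA memory it assumes a GPU machine.

```python
# Sketch of the MetricLogger / SmoothedValue logging pattern defined above.
import torch
import utils  # hypothetical import path for refer/utils.py

logger = utils.MetricLogger(delimiter="  ")
logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))

loader = range(100)  # stands in for a DataLoader
for step in logger.log_every(loader, print_freq=10, header='Train:'):
    loss = torch.rand(1)                     # placeholder for the real training loss
    logger.update(loss=loss.item(), lr=3e-5)  # values are smoothed over a window
```
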
refer/v1-inference.yaml ADDED
@@ -0,0 +1,70 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false   # Note: different from the one we trained before
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+
+     scheduler_config: # 10000 warmup steps
+       target: ldm.lr_scheduler.LambdaLinearScheduler
+       params:
+         warm_up_steps: [ 10000 ]
+         cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+         f_start: [ 1.e-6 ]
+         f_max: [ 1. ]
+         f_min: [ 1. ]
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
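
This file mirrors the standard Stable Diffusion v1 inference config; `EVPRefer.__init__` above loads it, overrides `ckpt_path`, and instantiates the model to reuse its VAE encoder and UNet. A short loading sketch follows; it assumes the `ldm` package from the Stable Diffusion codebase is importable and uses a hypothetical checkpoint path.

```python
# Sketch of how this config is consumed (mirrors EVPRefer.__init__ above).
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config  # assumes the Stable Diffusion `ldm` package is installed

config = OmegaConf.load('./v1-inference.yaml')
config.model.params.ckpt_path = './stable-diffusion-v1.ckpt'  # hypothetical checkpoint path

sd_model = instantiate_from_config(config.model)
encoder_vq = sd_model.first_stage_model  # KL autoencoder used to produce latents
unet = sd_model.model                    # UNet that EVPRefer wraps with UNetWrapper
```
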