befozg committed on
Commit f0de4e8 · 1 Parent(s): 9967c2f

added initial portrait transfer app

Files changed (10)
  1. .gitignore +174 -0
  2. app.py +107 -0
  3. requirements.txt +38 -0
  4. slider.html +137 -0
  5. tools/__init__.py +3 -0
  6. tools/inference.py +56 -0
  7. tools/model.py +296 -0
  8. tools/normalizer.py +261 -0
  9. tools/stylematte.py +506 -0
  10. tools/util.py +345 -0
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # poetry
99
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103
+ #poetry.lock
104
+
105
+ # pdm
106
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107
+ #pdm.lock
108
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109
+ # in version control.
110
+ # https://pdm.fming.dev/#use-with-ide
111
+ .pdm.toml
112
+
113
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114
+ __pypackages__/
115
+
116
+ # Celery stuff
117
+ celerybeat-schedule
118
+ celerybeat.pid
119
+
120
+ # SageMath parsed files
121
+ *.sage.py
122
+
123
+ # Environments
124
+ .env
125
+ .venv
126
+ env/
127
+ venv/
128
+ ENV/
129
+ env.bak/
130
+ venv.bak/
131
+
132
+ # Spyder project settings
133
+ .spyderproject
134
+ .spyproject
135
+
136
+ # Rope project settings
137
+ .ropeproject
138
+
139
+ # mkdocs documentation
140
+ /site
141
+
142
+ # mypy
143
+ .mypy_cache/
144
+ .dmypy.json
145
+ dmypy.json
146
+
147
+ # Pyre type checker
148
+ .pyre/
149
+
150
+ # pytype static type analyzer
151
+ .pytype/
152
+
153
+ # Cython debug symbols
154
+ cython_debug/
155
+
156
+ # PyCharm
157
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
160
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
+ #.idea/
162
+
163
+ config/*
164
+ trainer/__pycache__/
165
+ trainer/__pycache__/*
166
+ __pycache__/*
167
+ checkpoints/*.pth
168
+ */*.pth
169
+ */checkpoints/best_pure.pth
170
+ checkpoints/best_pure.pth
171
+ *.ipynb
172
+ .ipynb_checkpoints/*
173
+ flagged/
174
+ assets/
app.py ADDED
@@ -0,0 +1,107 @@
1
+ import gradio as gr
2
+ from tools import Inference, Matting, log
3
+ from omegaconf import OmegaConf
4
+ import os
5
+ import sys
6
+ import numpy as np
7
+ import torchvision.transforms.functional as tf
8
+ from PIL import Image
9
+
10
+ args = OmegaConf.load("./config/test.yaml")
11
+
12
+ global_comp = None
13
+ global_mask = None
14
+
15
+ log("Model loading")
16
+ phnet = Inference(**args)
17
+ stylematte = Matting(**args)
18
+ log("Model loaded")
19
+
20
+
21
+ def harmonize(comp, mask):
22
+ log("Inference started")
23
+ if comp is None or mask is None:
24
+ log("Empty source")
25
+ return np.zeros((16, 16, 3))
26
+
27
+ comp = comp.convert('RGB')
28
+ mask = mask.convert('1')
29
+ in_shape = comp.size[::-1]
30
+
31
+ comp = tf.resize(comp, [args.image_size, args.image_size])
32
+ mask = tf.resize(mask, [args.image_size, args.image_size])
33
+
34
+ compt = tf.to_tensor(comp)
35
+ maskt = tf.to_tensor(mask)
36
+ res = phnet.harmonize(compt, maskt)
37
+ res = tf.resize(res, in_shape)
38
+
39
+ log("Inference finished")
40
+
41
+ return np.uint8((res*255)[0].permute(1, 2, 0).numpy())
42
+
43
+
44
+ def extract_matte(img, back):
45
+ mask, fg = stylematte.extract(img)
46
+ fg_pil = Image.fromarray(np.uint8(fg))
47
+
48
+ composite = fg + (1 - mask[:, :, None]) * \
49
+ np.array(back.resize(mask.shape[::-1]))
50
+ composite_pil = Image.fromarray(np.uint8(composite))
51
+
52
+ global_comp = composite_pil
53
+ global_mask = mask
54
+
55
+ return [composite_pil, mask, fg_pil]
56
+
57
+
58
+ def css(height=3, scale=2):
59
+ return f".output_image {{height: {height}rem !important; width: {scale}rem !important;}}"
60
+
61
+
62
+ with gr.Blocks() as demo:
63
+ gr.Markdown(
64
+ """
65
+ # Welcome to the portrait transfer demo app!
66
+ Select a source portrait image and a new background.
67
+ """)
68
+ btn_compose = gr.Button(value="Compose")
69
+
70
+ with gr.Row():
71
+ input_ui = gr.Image(
72
+ type="numpy", label='Source image to extract foreground')
73
+ back_ui = gr.Image(type="pil", label='The new background')
74
+
75
+ gr.Examples(
76
+ examples=[["./assets/comp.jpg", "./assets/back.jpg"]],
77
+ inputs=[input_ui, back_ui],
78
+ )
79
+
80
+ gr.Markdown(
81
+ """
82
+ ## Resulting alpha matte and extracted foreground.
83
+ """)
84
+ with gr.Row():
85
+ matte_ui = gr.Image(type="pil", label='Alpha matte')
86
+ fg_ui = gr.Image(type="pil", image_mode='RGBA',
87
+ label='Extracted foreground')
88
+
89
+ gr.Markdown(
90
+ """
91
+ ## Click the button and compare the composite with the harmonized version.
92
+ """)
93
+ btn_harmonize = gr.Button(value="Harmonize composite")
94
+
95
+ with gr.Row():
96
+ composite_ui = gr.Image(type="pil", label='Composite')
97
+ harmonized_ui = gr.Image(
98
+ type="pil", label='Harmonized composite', css=css(3, 3))
99
+
100
+ btn_compose.click(extract_matte, inputs=[input_ui, back_ui], outputs=[
101
+ composite_ui, matte_ui, fg_ui])
102
+ btn_harmonize.click(harmonize, inputs=[
103
+ composite_ui, matte_ui], outputs=[harmonized_ui])
104
+
105
+
106
+ log("Interface created")
107
+ demo.launch(share=True)
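For reference, a minimal headless sketch of the same pipeline that app.py wires into Gradio. It assumes config/test.yaml and the checkpoints it references are available locally (they are not part of this commit), and the input file names are placeholders:

import numpy as np
from PIL import Image
import torchvision.transforms.functional as tf
from omegaconf import OmegaConf
from tools import Inference, Matting

args = OmegaConf.load("./config/test.yaml")   # same config app.py loads
phnet = Inference(**args)                     # harmonization model (PHNet)
stylematte = Matting(**args)                  # matting model (StyleMatte)

source = np.array(Image.open("portrait.jpg").convert("RGB"))   # placeholder path
background = Image.open("background.jpg").convert("RGB")       # placeholder path

# 1. Extract the alpha matte and foreground, then paste them onto the new background.
mask, fg = stylematte.extract(source)
composite = fg + (1 - mask[:, :, None]) * np.array(background.resize(mask.shape[::-1]))

# 2. Harmonize the composite; Inference.harmonize resizes inputs to args.image_size
#    internally, so the result comes back at that resolution with values in [0, 1].
comp_t = tf.to_tensor(np.uint8(composite))    # (3, H, W)
mask_t = tf.to_tensor(np.float32(mask))       # (1, H, W)
result = phnet.harmonize(comp_t, mask_t)      # (1, 3, image_size, image_size)
out = np.uint8((result * 255)[0].permute(1, 2, 0).numpy())
Image.fromarray(out).save("harmonized.png")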
requirements.txt ADDED
@@ -0,0 +1,38 @@
1
+ gradio==3.30.0
2
+ gradio_client==0.2.4
3
+ huggingface-hub==0.14.1
4
+ imageio==2.25.1
5
+ imgcat==0.5.0
6
+ ipykernel==6.16.0
7
+ ipython==8.5.0
8
+ ipywidgets==8.0.2
9
+ kiwisolver==1.4.2
10
+ kornia==0.6.9
11
+ legacy==0.1.6
12
+ numpy==1.21.6
13
+ omegaconf==2.2.3
14
+ opencv-python==4.5.5.62
15
+ opencv-python-headless==4.7.0.68
16
+ packaging==21.3
17
+ pandas==1.4.2
18
+ parso==0.8.3
19
+ Pillow==9.4.0
20
+ protobuf==3.20.1
21
+ Pygments==2.13.0
22
+ PyMatting==1.1.8
23
+ pyparsing==3.0.9
24
+ pyrsistent==0.19.3
25
+ scikit-image==0.19.3
26
+ scikit-learn==1.1.1
27
+ scipy==1.10.0
28
+ seaborn==0.12.2
29
+ sklearn==0.0
30
+ sniffio==1.3.0
31
+ soupsieve==2.4
32
+ timm==0.6.12
33
+ torch==1.11.0
34
+ torchaudio==0.11.0
35
+ torchvision==0.12.0
36
+ tornado==6.2
37
+ tqdm==4.64.1
38
+ transformers==4.28.1
slider.html ADDED
@@ -0,0 +1,137 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
5
+ <style>
6
+ * {box-sizing: border-box;}
7
+
8
+ .img-comp-container {
9
+ position: relative;
10
+ height: 200px; /*should be the same height as the images*/
11
+ }
12
+
13
+ .img-comp-img {
14
+ position: absolute;
15
+ width: auto;
16
+ height: auto;
17
+ overflow:hidden;
18
+ }
19
+
20
+ .img-comp-img img {
21
+ display:block;
22
+ vertical-align:middle;
23
+ }
24
+
25
+ .img-comp-slider {
26
+ position: absolute;
27
+ z-index:9;
28
+ cursor: ew-resize;
29
+ /*set the appearance of the slider:*/
30
+ width: 40px;
31
+ height: 40px;
32
+ background-color: #2196F3;
33
+ opacity: 0.7;
34
+ border-radius: 50%;
35
+ }
36
+ </style>
37
+ <script>
38
+ function initComparisons() {
39
+ var x, i;
40
+ /*find all elements with an "overlay" class:*/
41
+ x = document.getElementsByClassName("img-comp-overlay");
42
+ for (i = 0; i < x.length; i++) {
43
+ /*once for each "overlay" element:
44
+ pass the "overlay" element as a parameter when executing the compareImages function:*/
45
+ compareImages(x[i]);
46
+ }
47
+ function compareImages(img) {
48
+ var slider, img, clicked = 0, w, h;
49
+ /*get the width and height of the img element*/
50
+ w = img.offsetWidth;
51
+ h = img.offsetHeight;
52
+ /*set the width of the img element to 50%:*/
53
+ img.style.width = (w / 2) + "px";
54
+ /*create slider:*/
55
+ slider = document.createElement("DIV");
56
+ slider.setAttribute("class", "img-comp-slider");
57
+ /*insert slider*/
58
+ img.parentElement.insertBefore(slider, img);
59
+ /*position the slider in the middle:*/
60
+ slider.style.top = (h / 2) - (slider.offsetHeight / 2) + "px";
61
+ slider.style.left = (w / 2) - (slider.offsetWidth / 2) + "px";
62
+ /*execute a function when the mouse button is pressed:*/
63
+ slider.addEventListener("mousedown", slideReady);
64
+ /*and another function when the mouse button is released:*/
65
+ window.addEventListener("mouseup", slideFinish);
66
+ /*or touched (for touch screens):*/
67
+ slider.addEventListener("touchstart", slideReady);
68
+ /*and released (for touch screens):*/
69
+ window.addEventListener("touchend", slideFinish);
70
+ function slideReady(e) {
71
+ /*prevent any other actions that may occur when moving over the image:*/
72
+ e.preventDefault();
73
+ /*the slider is now clicked and ready to move:*/
74
+ clicked = 1;
75
+ /*execute a function when the slider is moved:*/
76
+ window.addEventListener("mousemove", slideMove);
77
+ window.addEventListener("touchmove", slideMove);
78
+ }
79
+ function slideFinish() {
80
+ /*the slider is no longer clicked:*/
81
+ clicked = 0;
82
+ }
83
+ function slideMove(e) {
84
+ var pos;
85
+ /*if the slider is no longer clicked, exit this function:*/
86
+ if (clicked == 0) return false;
87
+ /*get the cursor's x position:*/
88
+ pos = getCursorPos(e)
89
+ /*prevent the slider from being positioned outside the image:*/
90
+ if (pos < 0) pos = 0;
91
+ if (pos > w) pos = w;
92
+ /*execute a function that will resize the overlay image according to the cursor:*/
93
+ slide(pos);
94
+ }
95
+ function getCursorPos(e) {
96
+ var a, x = 0;
97
+ e = (e.changedTouches) ? e.changedTouches[0] : e;
98
+ /*get the x positions of the image:*/
99
+ a = img.getBoundingClientRect();
100
+ /*calculate the cursor's x coordinate, relative to the image:*/
101
+ x = e.pageX - a.left;
102
+ /*consider any page scrolling:*/
103
+ x = x - window.pageXOffset;
104
+ return x;
105
+ }
106
+ function slide(x) {
107
+ /*resize the image:*/
108
+ img.style.width = x + "px";
109
+ /*position the slider:*/
110
+ slider.style.left = img.offsetWidth - (slider.offsetWidth / 2) + "px";
111
+ }
112
+ }
113
+ }
114
+ </script>
115
+ </head>
116
+ <body>
117
+
118
+ <h1>Compare Two Images</h1>
119
+
120
+ <p>Click and slide the blue slider to compare two images:</p>
121
+
122
+ <div class="img-comp-container">
123
+ <div class="img-comp-img">
124
+ <img src="img_snow.jpg" width="300" height="200">
125
+ </div>
126
+ <div class="img-comp-img img-comp-overlay">
127
+ <img src="img_forest.jpg" width="300" height="200">
128
+ </div>
129
+ </div>
130
+
131
+ <script>
132
+ /*Execute a function that will execute an image compare function for each element with the img-comp-overlay class:*/
133
+ initComparisons();
134
+ </script>
135
+
136
+ </body>
137
+ </html>
tools/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .inference import Inference
2
+ from .inference import Matting
3
+ from .util import log
tools/inference.py ADDED
@@ -0,0 +1,56 @@
1
+ import torch
2
+ from .model import PHNet
3
+ import torchvision.transforms.functional as tf
4
+ from .util import inference_img, log
5
+ from .stylematte import StyleMatte
6
+ import numpy as np
7
+
8
+
9
+ class Inference:
10
+ def __init__(self, **kwargs):
11
+ self.rank = 0
12
+ self.__dict__.update(kwargs)
13
+ self.model = PHNet(enc_sizes=self.enc_sizes,
14
+ skips=self.skips,
15
+ grid_count=self.grid_counts,
16
+ init_weights=self.init_weights,
17
+ init_value=self.init_value)
18
+ log(f"checkpoint: {self.checkpoint.harmonizer}")
19
+ state = torch.load(self.checkpoint.harmonizer,
20
+ map_location=self.device)
21
+
22
+ self.model.load_state_dict(state, strict=True)
23
+ self.model.eval()
24
+
25
+ def harmonize(self, composite, mask):
26
+ if len(composite.shape) < 4:
27
+ composite = composite.unsqueeze(0)
28
+ while len(mask.shape) < 4:
29
+ mask = mask.unsqueeze(0)
30
+ composite = tf.resize(composite, [self.image_size, self.image_size])
31
+ mask = tf.resize(mask, [self.image_size, self.image_size])
32
+ log(f"{composite.shape}, {mask.shape}")
33
+ with torch.no_grad():
34
+ harmonized = self.model(composite, mask)['harmonized']
35
+
36
+ result = harmonized * mask + composite * (1-mask)
37
+ print(result.shape)
38
+ return result
39
+
40
+
41
+ class Matting:
42
+ def __init__(self, **kwargs):
43
+ self.rank = 0
44
+ self.__dict__.update(kwargs)
45
+ self.model = StyleMatte().to(self.device)
46
+ log(f"checkpoint: {self.checkpoint.matting}")
47
+ state = torch.load(self.checkpoint.matting, map_location=self.device)
48
+ self.model.load_state_dict(state, strict=True)
49
+ self.model.eval()
50
+
51
+ def extract(self, inp):
52
+ mask = inference_img(self.model, inp, self.device)
53
+ inp_np = np.array(inp)
54
+ fg = mask[:, :, None]*inp_np
55
+
56
+ return [mask, fg]
tools/model.py ADDED
@@ -0,0 +1,296 @@
1
+ from matplotlib import pyplot as plt
2
+ # from shtools import shReconstructSignal
3
+ from torchvision import transforms, utils
4
+ # from torchvision.ops import SqueezeExcitation
5
+ from torch.utils.data import Dataset
6
+ import torch.nn.functional as F
7
+ import torch.nn as nn
8
+ import torch
9
+ import math
10
+ import cv2
11
+ import numpy as np
12
+ from .normalizer import PatchNormalizer, PatchedHarmonizer
13
+ from .util import rgb_to_lab, lab_to_rgb, lab_shift
14
+
15
+ # from shtools import *
16
+ # from color_converters import luv_to_rgb, rgb_to_luv
17
+ # from skimage import io, transform
18
+ '''
19
+ Input (256,512,3)
20
+ '''
21
+
22
+
23
+ def inpaint_bg(comp, mask, dim=[2, 3]):
24
+ """
25
+ inpaint bg for ihd
26
+ Args:
27
+ comp (torch.float): [0:1]
28
+ mask (torch.float): [0:1]
29
+ """
30
+ back = comp * (1-mask) # *255
31
+ sum = torch.sum(back, dim=dim) # (B, C)
32
+ num = torch.sum((1-mask), dim=dim) # (B, C)
33
+ mu = sum / (num)
34
+ mean = mu[:, :, None, None]
35
+ back = back + mask * mean
36
+
37
+ return back
38
+
39
+
40
+ class ConvTransposeUp(nn.Sequential):
41
+ def __init__(self, in_channels, out_channels, kernel_size=4, padding=1, stride=2, activation=None):
42
+ super().__init__(
43
+ nn.ConvTranspose2d(in_channels, out_channels,
44
+ kernel_size=kernel_size, padding=padding, stride=stride),
45
+ activation() if activation is not None else nn.Identity(),
46
+ )
47
+
48
+
49
+ class UpsampleShuffle(nn.Sequential):
50
+ def __init__(self, in_channels, out_channels, activation=True):
51
+ super().__init__(
52
+ nn.Conv2d(in_channels, out_channels * 4, kernel_size=1),
53
+ nn.GELU() if activation else nn.Identity(),
54
+ nn.PixelShuffle(2)
55
+ )
56
+
57
+ def reset_parameters(self):
58
+ init_subpixel(self[0].weight)
59
+ nn.init.zeros_(self[0].bias)
60
+
61
+
62
+ class UpsampleResize(nn.Sequential):
63
+ def __init__(self, in_channels, out_channels, out_size=None, activation=None, scale_factor=2., mode='bilinear'):
64
+ super().__init__(
65
+ nn.Upsample(scale_factor=scale_factor, mode=mode) if out_size is None else nn.Upsample(
66
+ out_size, mode=mode),
67
+ nn.ReflectionPad2d(1),
68
+ nn.Conv2d(in_channels, out_channels,
69
+ kernel_size=3, stride=1, padding=0),
70
+ activation() if activation is not None else nn.Identity(),
71
+
72
+ )
73
+
74
+
75
+ def conv_bn(in_, out_, kernel_size=3, stride=1, padding=1, activation=nn.ReLU, normalization=nn.InstanceNorm2d):
76
+
77
+ return nn.Sequential(
78
+ nn.Conv2d(in_, out_, kernel_size, stride=stride, padding=padding),
79
+ normalization(out_) if normalization is not None else nn.Identity(),
80
+ activation(),
81
+ )
82
+
83
+
84
+ def init_subpixel(weight):
85
+ co, ci, h, w = weight.shape
86
+ co2 = co // 4
87
+ # initialize sub kernel
88
+ k = torch.empty([co2, ci, h, w])
89
+ nn.init.kaiming_uniform_(k)
90
+ # repeat 4 times
91
+ k = k.repeat_interleave(4, dim=0)
92
+ weight.data.copy_(k)
93
+
94
+
95
+ class DownsampleShuffle(nn.Sequential):
96
+ def __init__(self, in_channels):
97
+ assert in_channels % 4 == 0
98
+ super().__init__(
99
+ nn.Conv2d(in_channels, in_channels // 4, kernel_size=1),
100
+ nn.ReLU(),
101
+ nn.PixelUnshuffle(2)
102
+ )
103
+
104
+ def reset_parameters(self):
105
+ init_subpixel(self[0].weight)
106
+ nn.init.zeros_(self[0].bias)
107
+
108
+
109
+ def conv_bn_elu(in_, out_, kernel_size=3, stride=1, padding=True):
110
+ # conv layer with ELU activation function
111
+ pad = int(kernel_size/2)
112
+ if padding is False:
113
+ pad = 0
114
+ return nn.Sequential(
115
+ nn.Conv2d(in_, out_, kernel_size, stride=stride, padding=pad),
116
+ nn.ELU(),
117
+ )
118
+
119
+
120
+ class Inference_Data(Dataset):
121
+ def __init__(self, img_path):
122
+ self.input_img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
123
+ self.input_img = cv2.resize(
124
+ self.input_img, (512, 256), interpolation=cv2.INTER_CUBIC)
125
+ self.to_tensor = transforms.ToTensor()
126
+ self.data_len = 1
127
+
128
+ def __getitem__(self, index):
129
+ self.tensor_img = self.to_tensor(self.input_img)
130
+ return self.tensor_img
131
+
132
+ def __len__(self):
133
+ return self.data_len
134
+
135
+
136
+ class SEBlock(nn.Module):
137
+ def __init__(self, channel, reduction=8):
138
+ super(SEBlock, self).__init__()
139
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
140
+ self.fc = nn.Sequential(
141
+ nn.Linear(channel, channel//reduction),
142
+ nn.ReLU(inplace=True),
143
+ nn.Linear(channel//reduction, channel),
144
+ nn.Sigmoid())
145
+
146
+ def forward(self, x, aux_inp=None):
147
+ b, c, w, h = x.size()
148
+
149
+ def scale(x):
150
+ return (x - x.min()) / (x.max() - x.min() + 1e-8)
151
+ y1 = self.avg_pool(x).view(b, c)
152
+ y = self.fc(y1).view(b, c, 1, 1)
153
+ r = x*y
154
+ if aux_inp is not None:
155
+ aux_weights = nn.AdaptiveAvgPool2d(aux_inp.shape[-1]//8)(aux_inp)
156
+ aux_weights = nn.Sigmoid()(aux_weights.mean(1, keepdim=True))
157
+ tmp = x*aux_weights
158
+ tmp_img = (tmp - tmp.min()) / (tmp.max() - tmp.min())
159
+ r += tmp
160
+
161
+ return r
162
+
163
+
164
+ class ConvTransposeUp(nn.Sequential):
165
+ def __init__(self, in_channels, out_channels, norm, kernel_size=3, stride=2, padding=1, activation=None):
166
+ super().__init__(
167
+ nn.ConvTranspose2d(in_channels, out_channels,
168
+ # output_padding=output_padding, dilation=dilation
169
+ kernel_size=kernel_size, padding=padding, stride=stride,
170
+ ),
171
+ norm(out_channels) if norm is not None else nn.Identity(),
172
+ activation() if activation is not None else nn.Identity(),
173
+ )
174
+
175
+
176
+ class SkipConnect(nn.Module):
177
+ """docstring for RegionalSkipConnect"""
178
+
179
+ def __init__(self, channel):
180
+ super(SkipConnect, self).__init__()
181
+ self.rconv = nn.Conv2d(channel*2, channel, 3, padding=1, bias=False)
182
+
183
+ def forward(self, feature):
184
+ return F.relu(self.rconv(feature))
185
+
186
+
187
+ class AttentionBlock(nn.Module):
188
+ def __init__(self, in_channels):
189
+ super(AttentionBlock, self).__init__()
190
+ self.attn = nn.Sequential(
191
+ nn.Conv2d(in_channels * 2, in_channels * 2, kernel_size=1),
192
+ nn.Sigmoid()
193
+ )
194
+
195
+ def forward(self, x):
196
+ return self.attn(x)
197
+
198
+
199
+ class PatchHarmonizerBlock(nn.Module):
200
+ def __init__(self, in_channels=3, grid_count=5):
201
+ super(PatchHarmonizerBlock, self).__init__()
202
+ self.patch_harmonizer = PatchedHarmonizer(grid_count=grid_count)
203
+ self.head = conv_bn(in_channels*2, in_channels,
204
+ kernel_size=3, padding=1, normalization=None)
205
+
206
+ def forward(self, fg, bg, mask):
207
+ fg_harm, _ = self.patch_harmonizer(fg, bg, mask)
208
+
209
+ return self.head(torch.cat([fg, fg_harm], 1))
210
+
211
+
212
+ class PHNet(nn.Module):
213
+ def __init__(self, enc_sizes=[3, 16, 32, 64, 128, 256, 512], skips=True, grid_count=[10, 5, 1], init_weights=[0.5, 0.5], init_value=0.8):
214
+ super(PHNet, self).__init__()
215
+ self.skips = skips
216
+ self.feature_extractor = PatchHarmonizerBlock(
217
+ in_channels=enc_sizes[0], grid_count=grid_count[1])
218
+ self.encoder = nn.ModuleList([
219
+ conv_bn(enc_sizes[0], enc_sizes[1],
220
+ kernel_size=4, stride=2),
221
+ conv_bn(enc_sizes[1], enc_sizes[2],
222
+ kernel_size=3, stride=1),
223
+ conv_bn(enc_sizes[2], enc_sizes[3],
224
+ kernel_size=4, stride=2),
225
+ conv_bn(enc_sizes[3], enc_sizes[4],
226
+ kernel_size=3, stride=1),
227
+ conv_bn(enc_sizes[4], enc_sizes[5],
228
+ kernel_size=4, stride=2),
229
+ conv_bn(enc_sizes[5], enc_sizes[6],
230
+ kernel_size=3, stride=1),
231
+ ])
232
+
233
+ dec_ins = enc_sizes[::-1]
234
+ dec_sizes = enc_sizes[::-1]
235
+ self.start_level = len(dec_sizes) - len(grid_count)
236
+ self.normalizers = nn.ModuleList([
237
+ PatchNormalizer(in_channels=dec_sizes[self.start_level+i], grid_count=count, weights=init_weights, eps=1e-7, init_value=init_value) for i, count in enumerate(grid_count)
238
+ ])
239
+
240
+ self.decoder = nn.ModuleList([
241
+ ConvTransposeUp(
242
+ dec_ins[0], dec_sizes[1], norm=nn.BatchNorm2d, kernel_size=3, stride=1, activation=nn.LeakyReLU),
243
+ ConvTransposeUp(
244
+ dec_ins[1], dec_sizes[2], norm=nn.BatchNorm2d, kernel_size=4, stride=2, activation=nn.LeakyReLU),
245
+ ConvTransposeUp(
246
+ dec_ins[2], dec_sizes[3], norm=nn.BatchNorm2d, kernel_size=3, stride=1, activation=nn.LeakyReLU),
247
+ ConvTransposeUp(
248
+ dec_ins[3], dec_sizes[4], norm=None, kernel_size=4, stride=2, activation=nn.LeakyReLU),
249
+ ConvTransposeUp(
250
+ dec_ins[4], dec_sizes[5], norm=None, kernel_size=3, stride=1, activation=nn.LeakyReLU),
251
+ ConvTransposeUp(
252
+ dec_ins[5], 3, norm=None, kernel_size=4, stride=2, activation=None),
253
+ ])
254
+
255
+ self.skip = nn.ModuleList([
256
+ SkipConnect(x) for x in dec_ins
257
+ ])
258
+
259
+ self.SE_block = SEBlock(enc_sizes[6])
260
+
261
+ def forward(self, img, mask):
262
+ x = img
263
+
264
+ enc_outs = [x]
265
+ x_harm = self.feature_extractor(x*mask, x*(1-mask), mask)
266
+
267
+ # x = x_harm
268
+ masks = [mask]
269
+ for i, down_layer in enumerate(self.encoder):
270
+ x = down_layer(x)
271
+ scale_factor = 1. / (pow(2, 1 - i % 2))
272
+ masks.append(F.interpolate(masks[-1], scale_factor=scale_factor))
273
+ enc_outs.append(x)
274
+
275
+ x = self.SE_block(x, aux_inp=x_harm)
276
+
277
+ masks = masks[::-1]
278
+ for i, (up_layer, enc_out) in enumerate(zip(self.decoder, enc_outs[::-1])):
279
+ if i >= self.start_level:
280
+ enc_out = self.normalizers[i -
281
+ self.start_level](enc_out, enc_out, masks[i])
282
+ x = torch.cat([x, enc_out], 1)
283
+ x = self.skip[i](x)
284
+ x = up_layer(x)
285
+
286
+ relighted = F.sigmoid(x)
287
+
288
+ return {
289
+ "harmonized": relighted, # target prediction
290
+ }
291
+
292
+ def set_requires_grad(self, modules=["encoder", "sh_head", "resquare", "decoder"], value=False):
293
+ for module in modules:
294
+ attr = getattr(self, module, None)
295
+ if attr is not None:
296
+ attr.requires_grad_(value)
tools/normalizer.py ADDED
@@ -0,0 +1,261 @@
1
+ import numpy as np
2
+ import cv2
3
+ import os
4
+ import tqdm
5
+ import time
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from .util import rgb_to_lab, lab_to_rgb
10
+
11
+
12
+ def blend(f, b, a):
13
+ return f*a + b*(1 - a)
14
+
15
+
16
+ class PatchedHarmonizer(nn.Module):
17
+ def __init__(self, grid_count=1, init_weights=[0.9, 0.1]):
18
+ super(PatchedHarmonizer, self).__init__()
19
+ self.eps = 1e-8
20
+ # self.weights = torch.nn.Parameter(torch.ones((grid_count, grid_count)), requires_grad=True)
21
+ # self.grid_weights_ = torch.nn.Parameter(torch.FloatTensor(init_weights), requires_grad=True)
22
+ self.grid_weights = torch.nn.Parameter(
23
+ torch.FloatTensor(init_weights), requires_grad=True)
24
+ # self.weights.retain_graph = True
25
+ self.grid_count = grid_count
26
+
27
+ def lab_shift(self, x, invert=False):
28
+ x = x.float()
29
+ if invert:
30
+ x[:, 0, :, :] /= 2.55
31
+ x[:, 1, :, :] -= 128
32
+ x[:, 2, :, :] -= 128
33
+ else:
34
+ x[:, 0, :, :] *= 2.55
35
+ x[:, 1, :, :] += 128
36
+ x[:, 2, :, :] += 128
37
+
38
+ return x
39
+
40
+ def get_mean_std(self, img, mask, dim=[2, 3]):
41
+ sum = torch.sum(img*mask, dim=dim) # (B, C)
42
+ num = torch.sum(mask, dim=dim) # (B, C)
43
+ mu = sum / (num + self.eps)
44
+ mean = mu[:, :, None, None]
45
+ var = torch.sum(((img - mean)*mask) ** 2, dim=dim) / (num + self.eps)
46
+ var = var[:, :, None, None]
47
+
48
+ return mean, torch.sqrt(var+self.eps)
49
+
50
+ def compute_patch_statistics(self, lab):
51
+ means, stds = [], []
52
+ bs, dx, dy = lab.shape[0], lab.shape[2] // self.grid_count, lab.shape[3] // self.grid_count
53
+ for h in range(self.grid_count):
54
+ cmeans, cstds = [], []
55
+ for w in range(self.grid_count):
56
+ ind = [h*dx, (h+1)*dx, w*dy, (w+1)*dy]
57
+ if h == self.grid_count - 1:
58
+ ind[1] = None
59
+ if w == self.grid_count - 1:
60
+ ind[-1] = None
61
+ m, v = self.compute_mean_var(
62
+ lab[:, :, ind[0]:ind[1], ind[2]:ind[3]], dim=[2, 3])
63
+ cmeans.append(m)
64
+ cstds.append(v)
65
+ means.append(cmeans)
66
+ stds.append(cstds)
67
+
68
+ return means, stds
69
+
70
+ def compute_mean_var(self, x, dim=[1, 2]):
71
+ mean = x.float().mean(dim=dim)[:, :, None, None]
72
+ var = torch.sqrt(x.float().var(dim=dim))[:, :, None, None]
73
+
74
+ return mean, var
75
+
76
+ def forward(self, fg_rgb, bg_rgb, alpha, masked_stats=False):
77
+
78
+ bg_rgb = F.interpolate(bg_rgb, size=(
79
+ fg_rgb.shape[2:])) # b x C x H x W
80
+
81
+ bg_lab = bg_rgb # self.lab_shift(rgb_to_lab(bg_rgb/255.))
82
+ fg_lab = fg_rgb # self.lab_shift(rgb_to_lab(fg_rgb/255.))
83
+
84
+ if masked_stats:
85
+ self.bg_global_mean, self.bg_global_var = self.get_mean_std(
86
+ img=bg_lab, mask=(1-alpha))
87
+ self.fg_global_mean, self.fg_global_var = self.get_mean_std(
88
+ img=fg_lab, mask=torch.ones_like(alpha))
89
+ else:
90
+ self.bg_global_mean, self.bg_global_var = self.compute_mean_var(bg_lab, dim=[
91
+ 2, 3])
92
+ self.fg_global_mean, self.fg_global_var = self.compute_mean_var(fg_lab, dim=[
93
+ 2, 3])
94
+
95
+ self.bg_means, self.bg_vars = self.compute_patch_statistics(
96
+ bg_lab)
97
+ self.fg_means, self.fg_vars = self.compute_patch_statistics(
98
+ fg_lab)
99
+
100
+ fg_harm = self.harmonize(fg_lab)
101
+ # fg_harm = lab_to_rgb(fg_harm)
102
+ bg = F.interpolate(bg_rgb, size=(fg_rgb.shape[2:]))/255.
103
+
104
+ composite = blend(fg_harm, bg, alpha)
105
+
106
+ return composite, fg_harm
107
+
108
+ def harmonize(self, fg):
109
+ harmonized = torch.zeros_like(fg)
110
+ dx = fg.shape[2] // self.grid_count
111
+ dy = fg.shape[3] // self.grid_count
112
+ for h in range(self.grid_count):
113
+ for w in range(self.grid_count):
114
+ ind = [h*dx, (h+1)*dx, w*dy, (w+1)*dy]
115
+ if h == self.grid_count - 1:
116
+ ind[1] = None
117
+ if w == self.grid_count - 1:
118
+ ind[-1] = None
119
+ harmonized[:, :, ind[0]:ind[1], ind[2]:ind[3]] = self.normalize_channel(
120
+ fg[:, :, ind[0]:ind[1], ind[2]:ind[3]], h, w)
121
+
122
+ # harmonized = self.lab_shift(harmonized, invert=True)
123
+
124
+ return harmonized
125
+
126
+ def normalize_channel(self, value, h, w):
127
+
128
+ fg_local_mean, fg_local_var = self.fg_means[h][w], self.fg_vars[h][w]
129
+ bg_local_mean, bg_local_var = self.bg_means[h][w], self.bg_vars[h][w]
130
+ fg_global_mean, fg_global_var = self.fg_global_mean, self.fg_global_var
131
+ bg_global_mean, bg_global_var = self.bg_global_mean, self.bg_global_var
132
+
133
+ # global2global normalization
134
+ zeroed_mean = value - fg_global_mean
135
+ # (fg_v * div_global_v + (1-fg_v) * div_v)
136
+ scaled_var = zeroed_mean * (bg_global_var/(fg_global_var + self.eps))
137
+ normalized_global = scaled_var + bg_global_mean
138
+
139
+ # local2local normalization
140
+ zeroed_mean = value - fg_local_mean
141
+ # (fg_v * div_global_v + (1-fg_v) * div_v)
142
+ scaled_var = zeroed_mean * (bg_local_var/(fg_local_var + self.eps))
143
+ normalized_local = scaled_var + bg_local_mean
144
+
145
+ return self.grid_weights[0]*normalized_local + self.grid_weights[1]*normalized_global
146
+
147
+ def normalize_fg(self, value):
148
+ zeroed_mean = value - \
149
+ (self.fg_local_mean *
150
+ self.grid_weights[None, None, :, :, None, None]).sum().squeeze()
151
+ # (fg_v * div_global_v + (1-fg_v) * div_v)
152
+ scaled_var = zeroed_mean * \
153
+ (self.bg_global_var/(self.fg_global_var + self.eps))
154
+ normalized_lg = scaled_var + \
155
+ (self.bg_local_mean *
156
+ self.grid_weights[None, None, :, :, None, None]).sum().squeeze()
157
+
158
+ return normalized_lg
159
+
160
+
161
+ class PatchNormalizer(nn.Module):
162
+ def __init__(self, in_channels=3, eps=1e-7, grid_count=1, weights=[0.5, 0.5], init_value=1e-2):
163
+ super(PatchNormalizer, self).__init__()
164
+ self.grid_count = grid_count
165
+ self.eps = eps
166
+
167
+ self.weights = nn.Parameter(
168
+ torch.FloatTensor(weights), requires_grad=True)
169
+ self.fg_var = nn.Parameter(
170
+ init_value * torch.ones(in_channels)[None, :, None, None], requires_grad=True)
171
+ self.fg_bias = nn.Parameter(
172
+ init_value * torch.zeros(in_channels)[None, :, None, None], requires_grad=True)
173
+ self.patched_fg_var = nn.Parameter(
174
+ init_value * torch.ones(in_channels)[None, :, None, None], requires_grad=True)
175
+ self.patched_fg_bias = nn.Parameter(
176
+ init_value * torch.zeros(in_channels)[None, :, None, None], requires_grad=True)
177
+ self.bg_var = nn.Parameter(
178
+ init_value * torch.ones(in_channels)[None, :, None, None], requires_grad=True)
179
+ self.bg_bias = nn.Parameter(
180
+ init_value * torch.zeros(in_channels)[None, :, None, None], requires_grad=True)
181
+ self.grid_weights = torch.nn.Parameter(torch.ones((in_channels, grid_count, grid_count))[
182
+ None, :, :, :] / (grid_count*grid_count*in_channels), requires_grad=True)
183
+
184
+ def local_normalization(self, value):
185
+ zeroed_mean = value - \
186
+ (self.fg_local_mean *
187
+ self.grid_weights[None, None, :, :, None, None]).sum().squeeze()
188
+ # (fg_v * div_global_v + (1-fg_v) * div_v)
189
+ scaled_var = zeroed_mean * \
190
+ (self.bg_global_var/(self.fg_global_var + self.eps))
191
+ normalized_lg = scaled_var + \
192
+ (self.bg_local_mean *
193
+ self.grid_weights[None, None, :, :, None, None]).sum().squeeze()
194
+
195
+ return normalized_lg
196
+
197
+ def get_mean_std(self, img, mask, dim=[2, 3]):
198
+ sum = torch.sum(img*mask, dim=dim) # (B, C)
199
+ num = torch.sum(mask, dim=dim) # (B, C)
200
+ mu = sum / (num + self.eps)
201
+ mean = mu[:, :, None, None]
202
+ var = torch.sum(((img - mean)*mask) ** 2, dim=dim) / (num + self.eps)
203
+ var = var[:, :, None, None]
204
+
205
+ return mean, torch.sqrt(var+self.eps)
206
+
207
+ def compute_patch_statistics(self, img, mask):
208
+ means, stds = [], []
209
+ bs, dx, dy = img.shape[0], img.shape[2] // self.grid_count, img.shape[3] // self.grid_count
210
+ for h in range(self.grid_count):
211
+ cmeans, cstds = [], []
212
+ for w in range(self.grid_count):
213
+ ind = [h*dx, (h+1)*dx, w*dy, (w+1)*dy]
214
+ if h == self.grid_count - 1:
215
+ ind[1] = None
216
+ if w == self.grid_count - 1:
217
+ ind[-1] = None
218
+ m, v = self.get_mean_std(
219
+ img[:, :, ind[0]:ind[1], ind[2]:ind[3]], mask[:, :, ind[0]:ind[1], ind[2]:ind[3]], dim=[2, 3])
220
+ cmeans.append(m.reshape(m.shape[:2]))
221
+ cstds.append(v.reshape(v.shape[:2]))
222
+ means.append(torch.stack(cmeans))
223
+ stds.append(torch.stack(cstds))
224
+
225
+ return torch.stack(means), torch.stack(stds)
226
+
227
+ def compute_mean_var(self, x, dim=[2, 3]):
228
+ mean = x.float().mean(dim=dim)
229
+ var = torch.sqrt(x.float().var(dim=dim))
230
+
231
+ return mean, var
232
+
233
+ def forward(self, fg, bg, mask):
234
+
235
+ self.local_means, self.local_vars = self.compute_patch_statistics(
236
+ bg, (1-mask))
237
+
238
+ bg_mean, bg_var = self.get_mean_std(bg, 1 - mask)
239
+ zeroed_mean = (bg - bg_mean)
240
+ unscaled = zeroed_mean / bg_var
241
+ bg_normalized = unscaled * self.bg_var + self.bg_bias
242
+
243
+ fg_mean, fg_var = self.get_mean_std(fg, mask)
244
+ zeroed_mean = fg - fg_mean
245
+ unscaled = zeroed_mean / fg_var
246
+
247
+ mean_patched_back = (self.local_means.permute(
248
+ 2, 3, 0, 1)*self.grid_weights).sum(dim=[2, 3])[:, :, None, None]
249
+
250
+ normalized = unscaled * bg_var + bg_mean
251
+ patch_normalized = unscaled * bg_var + mean_patched_back
252
+
253
+ fg_normalized = normalized * self.fg_var + self.fg_bias
254
+ fg_patch_normalized = patch_normalized * \
255
+ self.patched_fg_var + self.patched_fg_bias
256
+
257
+ fg_result = self.weights[0] * fg_normalized + \
258
+ self.weights[1] * fg_patch_normalized
259
+ composite = blend(fg_result, bg_normalized, mask)
260
+
261
+ return composite
tools/stylematte.py ADDED
@@ -0,0 +1,506 @@
1
+ import cv2
2
+ import random
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import numpy as np
7
+ from typing import List
8
+ from itertools import chain
9
+
10
+ from transformers import SegformerForSemanticSegmentation, Mask2FormerForUniversalSegmentation
11
+ device = 'cpu'
12
+
13
+
14
+ class EncoderDecoder(nn.Module):
15
+ def __init__(
16
+ self,
17
+ encoder,
18
+ decoder,
19
+ prefix=nn.Conv2d(3, 3, kernel_size=3, padding=1, bias=True),
20
+ ):
21
+ super().__init__()
22
+ self.encoder = encoder
23
+ self.decoder = decoder
24
+ self.prefix = prefix
25
+
26
+ def forward(self, x):
27
+ if self.prefix is not None:
28
+ x = self.prefix(x)
29
+ x = self.encoder(x)["hidden_states"] # transformers
30
+ return self.decoder(x)
31
+
32
+
33
+ def conv2d_relu(input_filters, output_filters, kernel_size=3, bias=True):
34
+ return nn.Sequential(
35
+ nn.Conv2d(input_filters, output_filters,
36
+ kernel_size=kernel_size, padding=kernel_size//2, bias=bias),
37
+ nn.LeakyReLU(0.2, inplace=True),
38
+ nn.BatchNorm2d(output_filters)
39
+ )
40
+
41
+
42
+ def up_and_add(x, y):
43
+ return F.interpolate(x, size=(y.size(2), y.size(3)), mode='bilinear', align_corners=True) + y
44
+
45
+
46
+ class FPN_fuse(nn.Module):
47
+ def __init__(self, feature_channels=[256, 512, 1024, 2048], fpn_out=256):
48
+ super(FPN_fuse, self).__init__()
49
+ assert feature_channels[0] == fpn_out
50
+ self.conv1x1 = nn.ModuleList([nn.Conv2d(ft_size, fpn_out, kernel_size=1)
51
+ for ft_size in feature_channels[1:]])
52
+ self.smooth_conv = nn.ModuleList([nn.Conv2d(fpn_out, fpn_out, kernel_size=3, padding=1)]
53
+ * (len(feature_channels)-1))
54
+ self.conv_fusion = nn.Sequential(
55
+ nn.Conv2d(2*fpn_out, fpn_out, kernel_size=3,
56
+ padding=1, bias=False),
57
+ nn.BatchNorm2d(fpn_out),
58
+ nn.ReLU(inplace=True),
59
+ )
60
+
61
+ def forward(self, features):
62
+
63
+ features[:-1] = [conv1x1(feature) for feature,
64
+ conv1x1 in zip(features[:-1], self.conv1x1)]
65
+ feature = up_and_add(self.smooth_conv[0](features[0]), features[1])
66
+ feature = up_and_add(self.smooth_conv[1](feature), features[2])
67
+ feature = up_and_add(self.smooth_conv[2](feature), features[3])
68
+
69
+ H, W = features[-1].size(2), features[-1].size(3)
70
+ x = [feature, features[-1]]
71
+ x = [F.interpolate(x_el, size=(H, W), mode='bilinear',
72
+ align_corners=True) for x_el in x]
73
+
74
+ x = self.conv_fusion(torch.cat(x, dim=1))
75
+ # x = F.interpolate(x, size=(H*4, W*4), mode='bilinear', align_corners=True)
76
+ return x
77
+
78
+
79
+ class PSPModule(nn.Module):
80
+ # In the original inmplementation they use precise RoI pooling
81
+ # Instead of using adaptative average pooling
82
+ def __init__(self, in_channels, bin_sizes=[1, 2, 4, 6]):
83
+ super(PSPModule, self).__init__()
84
+ out_channels = in_channels // len(bin_sizes)
85
+ self.stages = nn.ModuleList([self._make_stages(in_channels, out_channels, b_s)
86
+ for b_s in bin_sizes])
87
+ self.bottleneck = nn.Sequential(
88
+ nn.Conv2d(in_channels+(out_channels * len(bin_sizes)), in_channels,
89
+ kernel_size=3, padding=1, bias=False),
90
+ nn.BatchNorm2d(in_channels),
91
+ nn.ReLU(inplace=True),
92
+ nn.Dropout2d(0.1)
93
+ )
94
+
95
+ def _make_stages(self, in_channels, out_channels, bin_sz):
96
+ prior = nn.AdaptiveAvgPool2d(output_size=bin_sz)
97
+ conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
98
+ bn = nn.BatchNorm2d(out_channels)
99
+ relu = nn.ReLU(inplace=True)
100
+ return nn.Sequential(prior, conv, bn, relu)
101
+
102
+ def forward(self, features):
103
+ h, w = features.size()[2], features.size()[3]
104
+ pyramids = [features]
105
+ pyramids.extend([F.interpolate(stage(features), size=(h, w), mode='bilinear',
106
+ align_corners=True) for stage in self.stages])
107
+ output = self.bottleneck(torch.cat(pyramids, dim=1))
108
+ return output
109
+
110
+
111
+ class UperNet_swin(nn.Module):
112
+ # Implementing only the object path
113
+ def __init__(self, backbone, pretrained=True):
114
+ super(UperNet_swin, self).__init__()
115
+
116
+ self.backbone = backbone
117
+ feature_channels = [192, 384, 768, 768]
118
+ self.PPN = PSPModule(feature_channels[-1])
119
+ self.FPN = FPN_fuse(feature_channels, fpn_out=feature_channels[0])
120
+ self.head = nn.Conv2d(feature_channels[0], 1, kernel_size=3, padding=1)
121
+
122
+ def forward(self, x):
123
+ input_size = (x.size()[2], x.size()[3])
124
+ features = self.backbone(x)["hidden_states"]
125
+ features[-1] = self.PPN(features[-1])
126
+ x = self.head(self.FPN(features))
127
+
128
+ x = F.interpolate(x, size=input_size, mode='bilinear')
129
+ return x
130
+
131
+ def get_backbone_params(self):
132
+ return self.backbone.parameters()
133
+
134
+ def get_decoder_params(self):
135
+ return chain(self.PPN.parameters(), self.FPN.parameters(), self.head.parameters())
136
+
137
+
138
+ class UnetDecoder(nn.Module):
139
+ def __init__(
140
+ self,
141
+ encoder_channels=(3, 192, 384, 768, 768),
142
+ decoder_channels=(512, 256, 128, 64),
143
+ n_blocks=4,
144
+ use_batchnorm=True,
145
+ attention_type=None,
146
+ center=False,
147
+ ):
148
+ super().__init__()
149
+
150
+ if n_blocks != len(decoder_channels):
151
+ raise ValueError(
152
+ "Model depth is {}, but you provide `decoder_channels` for {} blocks.".format(
153
+ n_blocks, len(decoder_channels)
154
+ )
155
+ )
156
+
157
+ # remove first skip with same spatial resolution
158
+ encoder_channels = encoder_channels[1:]
159
+ # reverse channels to start from head of encoder
160
+ encoder_channels = encoder_channels[::-1]
161
+
162
+ # computing blocks input and output channels
163
+ head_channels = encoder_channels[0]
164
+ in_channels = [head_channels] + list(decoder_channels[:-1])
165
+ skip_channels = list(encoder_channels[1:]) + [0]
166
+
167
+ out_channels = decoder_channels
168
+
169
+ if center:
170
+ self.center = CenterBlock(
171
+ head_channels, head_channels, use_batchnorm=use_batchnorm)
172
+ else:
173
+ self.center = nn.Identity()
174
+
175
+ # combine decoder keyword arguments
176
+ kwargs = dict(use_batchnorm=use_batchnorm,
177
+ attention_type=attention_type)
178
+ blocks = [
179
+ DecoderBlock(in_ch, skip_ch, out_ch, **kwargs)
180
+ for in_ch, skip_ch, out_ch in zip(in_channels, skip_channels, out_channels)
181
+ ]
182
+ self.blocks = nn.ModuleList(blocks)
183
+ upscale_factor = 4
184
+ self.matting_head = nn.Sequential(
185
+ nn.Conv2d(64, 1, kernel_size=3, padding=1),
186
+ nn.ReLU(),
187
+ nn.UpsamplingBilinear2d(scale_factor=upscale_factor),
188
+ )
189
+
190
+ def preprocess_features(self, x):
191
+ features = []
192
+ for out_tensor in x:
193
+ bs, n, f = out_tensor.size()
194
+ h = int(n**0.5)
195
+ feature = out_tensor.view(-1, h, h,
196
+ f).permute(0, 3, 1, 2).contiguous()
197
+ features.append(feature)
198
+ return features
199
+
200
+ def forward(self, features):
201
+ # remove first skip with same spatial resolution
202
+ features = features[1:]
203
+ # reverse channels to start from head of encoder
204
+ features = features[::-1]
205
+
206
+ features = self.preprocess_features(features)
207
+
208
+ head = features[0]
209
+ skips = features[1:]
210
+
211
+ x = self.center(head)
212
+ for i, decoder_block in enumerate(self.blocks):
213
+ skip = skips[i] if i < len(skips) else None
214
+ x = decoder_block(x, skip)
215
+ # y_i = self.upsample1(y_i)
216
+ # hypercol = torch.cat([y0,y1,y2,y3,y4], dim=1)
217
+ x = self.matting_head(x)
218
+ x = 1-nn.ReLU()(1-x)
219
+ return x
220
+
221
+
222
+ class SegmentationHead(nn.Sequential):
223
+ def __init__(self, in_channels, out_channels, kernel_size=3, upsampling=1):
224
+ conv2d = nn.Conv2d(in_channels, out_channels,
225
+ kernel_size=kernel_size, padding=kernel_size // 2)
226
+ upsampling = nn.UpsamplingBilinear2d(
227
+ scale_factor=upsampling) if upsampling > 1 else nn.Identity()
228
+ super().__init__(conv2d, upsampling)
229
+
230
+
231
+ class DecoderBlock(nn.Module):
232
+ def __init__(
233
+ self,
234
+ in_channels,
235
+ skip_channels,
236
+ out_channels,
237
+ use_batchnorm=True,
238
+ attention_type=None,
239
+ ):
240
+ super().__init__()
241
+ self.conv1 = conv2d_relu(
242
+ in_channels + skip_channels,
243
+ out_channels,
244
+ kernel_size=3
245
+ )
246
+ self.conv2 = conv2d_relu(
247
+ out_channels,
248
+ out_channels,
249
+ kernel_size=3,
250
+ )
251
+ self.in_channels = in_channels
252
+ self.out_channels = out_channels
253
+ self.skip_channels = skip_channels
254
+
255
+ def forward(self, x, skip=None):
256
+ if skip is None:
257
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
258
+ else:
259
+ if x.shape[-1] != skip.shape[-1]:
260
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
261
+ if skip is not None:
262
+ # print(x.shape,skip.shape)
263
+ x = torch.cat([x, skip], dim=1)
264
+ x = self.conv1(x)
265
+ x = self.conv2(x)
266
+ return x
267
+
268
+
269
+ class CenterBlock(nn.Sequential):
270
+ def __init__(self, in_channels, out_channels):
271
+ conv1 = conv2d_relu(
272
+ in_channels,
273
+ out_channels,
274
+ kernel_size=3,
275
+ )
276
+ conv2 = conv2d_relu(
277
+ out_channels,
278
+ out_channels,
279
+ kernel_size=3,
280
+ )
281
+ super().__init__(conv1, conv2)
282
+
283
+
284
+ class SegForm(nn.Module):
285
+ def __init__(self):
286
+ super(SegForm, self).__init__()
287
+ # configuration = SegformerConfig.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
288
+ # configuration.num_labels = 1 ## set output as 1
289
+ # self.model = SegformerForSemanticSegmentation(config=configuration)
290
+
291
+ self.model = SegformerForSemanticSegmentation.from_pretrained("nvidia/mit-b0", num_labels=1, ignore_mismatched_sizes=True
292
+ )
293
+
294
+ def forward(self, image):
295
+ img_segs = self.model(image)
296
+ upsampled_logits = nn.functional.interpolate(img_segs.logits,
297
+ scale_factor=4,
298
+ mode='nearest',
299
+ )
300
+ return upsampled_logits
301
+
302
+
303
+ class StyleMatte(nn.Module):
304
+ def __init__(self):
305
+ super(StyleMatte, self).__init__()
306
+ # configuration = SegformerConfig.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
307
+ # configuration.num_labels = 1 ## set output as 1
308
+ self.fpn = FPN_fuse(feature_channels=[256, 256, 256, 256], fpn_out=256)
309
+ self.pixel_decoder = Mask2FormerForUniversalSegmentation.from_pretrained(
310
+ "facebook/mask2former-swin-tiny-coco-instance").base_model.pixel_level_module
311
+ self.fgf = FastGuidedFilter()
312
+ self.conv = nn.Conv2d(256, 1, kernel_size=3, padding=1)
313
+ # self.mean = torch.Tensor([0.43216, 0.394666, 0.37645]).float().view(-1, 1, 1)
314
+ # self.register_buffer('image_net_mean', self.mean)
315
+ # self.std = torch.Tensor([0.22803, 0.22145, 0.216989]).float().view(-1, 1, 1)
316
+ # self.register_buffer('image_net_std', self.std)
317
+
318
+ def forward(self, image, normalize=False):
319
+ # if normalize:
320
+ # image.sub_(self.get_buffer("image_net_mean")).div_(self.get_buffer("image_net_std"))
321
+
322
+ decoder_out = self.pixel_decoder(image)
323
+ decoder_states = list(decoder_out.decoder_hidden_states)
324
+ decoder_states.append(decoder_out.decoder_last_hidden_state)
325
+ out_pure = self.fpn(decoder_states)
326
+
327
+ image_lr = nn.functional.interpolate(image.mean(1, keepdim=True),
328
+ scale_factor=0.25,
329
+ mode='bicubic',
330
+ align_corners=True
331
+ )
332
+ out = self.conv(out_pure)
333
+ out = self.fgf(image_lr, out, image.mean(
334
+ 1, keepdim=True)) # .clip(0,1)
335
+ # out = nn.Sigmoid()(out)
336
+ # out = nn.functional.interpolate(out,
337
+ # scale_factor=4,
338
+ # mode='bicubic',
339
+ # align_corners=True
340
+ # )
341
+
342
+ return torch.sigmoid(out)
343
+
344
+ def get_training_params(self):
345
+ # +list(self.fgf.parameters())
346
+ return list(self.fpn.parameters())+list(self.conv.parameters())
347
+
348
+
349
+ class GuidedFilter(nn.Module):
350
+ def __init__(self, r, eps=1e-8):
351
+ super(GuidedFilter, self).__init__()
352
+
353
+ self.r = r
354
+ self.eps = eps
355
+ self.boxfilter = BoxFilter(r)
356
+
357
+ def forward(self, x, y):
358
+ n_x, c_x, h_x, w_x = x.size()
359
+ n_y, c_y, h_y, w_y = y.size()
360
+
361
+ assert n_x == n_y
362
+ assert c_x == 1 or c_x == c_y
363
+ assert h_x == h_y and w_x == w_y
364
+ assert h_x > 2 * self.r + 1 and w_x > 2 * self.r + 1
365
+
366
+ # N
367
+ N = self.boxfilter((x.data.new().resize_((1, 1, h_x, w_x)).fill_(1.0)))
368
+
369
+ # mean_x
370
+ mean_x = self.boxfilter(x) / N
371
+ # mean_y
372
+ mean_y = self.boxfilter(y) / N
373
+ # cov_xy
374
+ cov_xy = self.boxfilter(x * y) / N - mean_x * mean_y
375
+ # var_x
376
+ var_x = self.boxfilter(x * x) / N - mean_x * mean_x
377
+
378
+ # A
379
+ A = cov_xy / (var_x + self.eps)
380
+ # b
381
+ b = mean_y - A * mean_x
382
+
383
+ # mean_A; mean_b
384
+ mean_A = self.boxfilter(A) / N
385
+ mean_b = self.boxfilter(b) / N
386
+
387
+ return mean_A * x + mean_b
388
+
389
+
390
+ class FastGuidedFilter(nn.Module):
391
+ def __init__(self, r=1, eps=1e-8):
392
+ super(FastGuidedFilter, self).__init__()
393
+
394
+ self.r = r
395
+ self.eps = eps
396
+ self.boxfilter = BoxFilter(r)
397
+
398
+ def forward(self, lr_x, lr_y, hr_x):
399
+ n_lrx, c_lrx, h_lrx, w_lrx = lr_x.size()
400
+ n_lry, c_lry, h_lry, w_lry = lr_y.size()
401
+ n_hrx, c_hrx, h_hrx, w_hrx = hr_x.size()
402
+
403
+ assert n_lrx == n_lry and n_lry == n_hrx
404
+ assert c_lrx == c_hrx and (c_lrx == 1 or c_lrx == c_lry)
405
+ assert h_lrx == h_lry and w_lrx == w_lry
406
+ assert h_lrx > 2*self.r+1 and w_lrx > 2*self.r+1
407
+
408
+ # N
409
+ N = self.boxfilter(lr_x.new().resize_((1, 1, h_lrx, w_lrx)).fill_(1.0))
410
+
411
+ # mean_x
412
+ mean_x = self.boxfilter(lr_x) / N
413
+ # mean_y
414
+ mean_y = self.boxfilter(lr_y) / N
415
+ # cov_xy
416
+ cov_xy = self.boxfilter(lr_x * lr_y) / N - mean_x * mean_y
417
+ # var_x
418
+ var_x = self.boxfilter(lr_x * lr_x) / N - mean_x * mean_x
419
+
420
+ # A
421
+ A = cov_xy / (var_x + self.eps)
422
+ # b
423
+ b = mean_y - A * mean_x
424
+
425
+ # mean_A; mean_b
426
+ mean_A = F.interpolate(
427
+ A, (h_hrx, w_hrx), mode='bilinear', align_corners=True)
428
+ mean_b = F.interpolate(
429
+ b, (h_hrx, w_hrx), mode='bilinear', align_corners=True)
430
+
431
+ return mean_A*hr_x+mean_b
432
+
433
+
434
+ class DeepGuidedFilterRefiner(nn.Module):
435
+ def __init__(self, hid_channels=16):
436
+ super().__init__()
437
+ self.box_filter = nn.Conv2d(
438
+ 4, 4, kernel_size=3, padding=1, bias=False, groups=4)
439
+ self.box_filter.weight.data[...] = 1 / 9
440
+ self.conv = nn.Sequential(
441
+ nn.Conv2d(4 * 2 + hid_channels, hid_channels,
442
+ kernel_size=1, bias=False),
443
+ nn.BatchNorm2d(hid_channels),
444
+ nn.ReLU(True),
445
+ nn.Conv2d(hid_channels, hid_channels, kernel_size=1, bias=False),
446
+ nn.BatchNorm2d(hid_channels),
447
+ nn.ReLU(True),
448
+ nn.Conv2d(hid_channels, 4, kernel_size=1, bias=True)
449
+ )
450
+
451
+ def forward(self, fine_src, base_src, base_fgr, base_pha, base_hid):
452
+ fine_x = torch.cat([fine_src, fine_src.mean(1, keepdim=True)], dim=1)
453
+ base_x = torch.cat([base_src, base_src.mean(1, keepdim=True)], dim=1)
454
+ base_y = torch.cat([base_fgr, base_pha], dim=1)
455
+
456
+ mean_x = self.box_filter(base_x)
457
+ mean_y = self.box_filter(base_y)
458
+ cov_xy = self.box_filter(base_x * base_y) - mean_x * mean_y
459
+ var_x = self.box_filter(base_x * base_x) - mean_x * mean_x
460
+
461
+ A = self.conv(torch.cat([cov_xy, var_x, base_hid], dim=1))
462
+ b = mean_y - A * mean_x
463
+
464
+ H, W = fine_src.shape[2:]
465
+ A = F.interpolate(A, (H, W), mode='bilinear', align_corners=False)
466
+ b = F.interpolate(b, (H, W), mode='bilinear', align_corners=False)
467
+
468
+ out = A * fine_x + b
469
+ fgr, pha = out.split([3, 1], dim=1)
470
+ return fgr, pha
471
+
472
+
473
+ def diff_x(input, r):
474
+ assert input.dim() == 4
475
+
476
+ left = input[:, :, r:2 * r + 1]
477
+ middle = input[:, :, 2 * r + 1:] - input[:, :, :-2 * r - 1]
478
+ right = input[:, :, -1:] - input[:, :, -2 * r - 1: -r - 1]
479
+
480
+ output = torch.cat([left, middle, right], dim=2)
481
+
482
+ return output
483
+
484
+
485
+ def diff_y(input, r):
486
+ assert input.dim() == 4
487
+
488
+ left = input[:, :, :, r:2 * r + 1]
489
+ middle = input[:, :, :, 2 * r + 1:] - input[:, :, :, :-2 * r - 1]
490
+ right = input[:, :, :, -1:] - input[:, :, :, -2 * r - 1: -r - 1]
491
+
492
+ output = torch.cat([left, middle, right], dim=3)
493
+
494
+ return output
495
+
496
+
497
+ class BoxFilter(nn.Module):
498
+ def __init__(self, r):
499
+ super(BoxFilter, self).__init__()
500
+
501
+ self.r = r
502
+
503
+ def forward(self, x):
504
+ assert x.dim() == 4
505
+
506
+ return diff_y(diff_x(x.cumsum(dim=2), self.r).cumsum(dim=3), self.r)
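As a reading aid for the filtering code above (a restatement, not an addition): GuidedFilter and FastGuidedFilter implement the closed-form guided filter

$$A = \frac{\operatorname{cov}(x, y)}{\operatorname{var}(x) + \epsilon}, \qquad b = \bar{y} - A\,\bar{x}, \qquad q = \bar{A}\,x + \bar{b},$$

with all means taken over box windows of radius r; BoxFilter computes those windowed sums in constant time per pixel via cumulative sums along each axis (diff_x / diff_y). FastGuidedFilter fits A and b on the low-resolution pair (lr_x, lr_y) and bilinearly upsamples them before applying them to hr_x.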
tools/util.py ADDED
@@ -0,0 +1,345 @@
1
+ import math
2
+ import numpy as np
3
+ from typing import Tuple
4
+ import torch
5
+ import torch.nn as nn
6
+ from torchvision.utils import make_grid
7
+ import cv2
8
+ from torchvision import transforms, models
9
+
10
+
11
+ def log(msg, lvl='info'):
12
+ if lvl == 'info':
13
+ print(f"***********{msg}****************")
14
+ if lvl == 'error':
15
+ print(f"!!! Exception: {msg} !!!")
16
+
17
+
18
+ def lab_shift(x, invert=False):
19
+ x = x.float()
20
+ if invert:
21
+ x[:, 0, :, :] /= 2.55
22
+ x[:, 1, :, :] -= 128
23
+ x[:, 2, :, :] -= 128
24
+ else:
25
+ x[:, 0, :, :] *= 2.55
26
+ x[:, 1, :, :] += 128
27
+ x[:, 2, :, :] += 128
28
+
29
+ return x
30
+
31
+
32
+ def calculate_psnr(img1, img2):
33
+ # img1 and img2 have range [0, 255]
34
+ img1 = img1.astype(np.float64)
35
+ img2 = img2.astype(np.float64)
36
+ mse = np.mean((img1 - img2)**2)
37
+ if mse == 0:
38
+ return float('inf')
39
+
40
+ return 20 * math.log10(255.0 / math.sqrt(mse))
41
+
42
+
43
+ def calculate_fpsnr(fmse):
44
+ return 10 * math.log10(255.0 / (fmse + 1e-8))
45
+
46
+
+ def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1), bit=8):
+     '''
+     Converts a torch Tensor into an image Numpy array
+     Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order
+     Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default)
+     '''
+     norm = float(2**bit) - 1
+     # print('before', tensor[:,:,0].max(), tensor[:,:,0].min(), '\t', tensor[:,:,1].max(), tensor[:,:,1].min(), '\t', tensor[:,:,2].max(), tensor[:,:,2].min())
+     tensor = tensor.squeeze().float().cpu().clamp_(*min_max)  # clamp
+     # print('clamp ', tensor[:,:,0].max(), tensor[:,:,0].min(), '\t', tensor[:,:,1].max(), tensor[:,:,1].min(), '\t', tensor[:,:,2].max(), tensor[:,:,2].min())
+     tensor = (tensor - min_max[0]) / \
+         (min_max[1] - min_max[0])  # to range [0,1]
+     n_dim = tensor.dim()
+     if n_dim == 4:
+         n_img = len(tensor)
+         img_np = make_grid(tensor, nrow=int(
+             math.sqrt(n_img)), normalize=False).numpy()
+         img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
+     elif n_dim == 3:
+         img_np = tensor.numpy()
+         img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
+     elif n_dim == 2:
+         img_np = tensor.numpy()
+     else:
+         raise TypeError(
+             'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim))
+     if out_type == np.uint8:
+         # Important. Unlike matlab, numpy.uint8() WILL NOT round by default.
+         img_np = (img_np * norm).round()
+     return img_np.astype(out_type)
+
+
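Illustrative usage of tensor2img (not from the repo): a CHW float tensor in [0, 1] with RGB channel order comes back as an HWC uint8 array in BGR order, i.e. ready for cv2.imwrite:

import torch
from tools.util import tensor2img

chw = torch.rand(3, 32, 32)           # RGB, values in [0, 1]
img = tensor2img(chw)
print(img.shape, img.dtype)           # expected: (32, 32, 3) uint8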
+ def rgb_to_lab(image: torch.Tensor) -> torch.Tensor:
+     r"""Convert an RGB image to Lab.
+
+     .. image:: _static/img/rgb_to_lab.png
+
+     The input RGB image is assumed to be in the range of :math:`[0, 1]`. Lab
+     color is computed using the D65 illuminant and Observer 2.
+
+     Args:
+         image: RGB Image to be converted to Lab with shape :math:`(*, 3, H, W)`.
+
+     Returns:
+         Lab version of the image with shape :math:`(*, 3, H, W)`.
+         The L channel values are in the range 0..100. a and b are in the range -128..127.
+
+     Example:
+         >>> input = torch.rand(2, 3, 4, 5)
+         >>> output = rgb_to_lab(input)  # 2x3x4x5
+     """
+     if not isinstance(image, torch.Tensor):
+         raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")
+
+     if len(image.shape) < 3 or image.shape[-3] != 3:
+         raise ValueError(
+             f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
+
+     # Convert from sRGB to Linear RGB
+     lin_rgb = rgb_to_linear_rgb(image)
+
+     xyz_im: torch.Tensor = rgb_to_xyz(lin_rgb)
+
+     # normalize for D65 white point
+     xyz_ref_white = torch.tensor(
+         [0.95047, 1.0, 1.08883], device=xyz_im.device, dtype=xyz_im.dtype)[..., :, None, None]
+     xyz_normalized = torch.div(xyz_im, xyz_ref_white)
+
+     threshold = 0.008856
+     power = torch.pow(xyz_normalized.clamp(min=threshold), 1 / 3.0)
+     scale = 7.787 * xyz_normalized + 4.0 / 29.0
+     xyz_int = torch.where(xyz_normalized > threshold, power, scale)
+
+     x: torch.Tensor = xyz_int[..., 0, :, :]
+     y: torch.Tensor = xyz_int[..., 1, :, :]
+     z: torch.Tensor = xyz_int[..., 2, :, :]
+
+     L: torch.Tensor = (116.0 * y) - 16.0
+     a: torch.Tensor = 500.0 * (x - y)
+     _b: torch.Tensor = 200.0 * (y - z)
+
+     out: torch.Tensor = torch.stack([L, a, _b], dim=-3)
+
+     return out
+
+
+ def lab_to_rgb(image: torch.Tensor, clip: bool = True) -> torch.Tensor:
+     r"""Convert a Lab image to RGB.
+
+     The L channel is assumed to be in the range of :math:`[0, 100]`.
+     a and b channels are in the range of :math:`[-128, 127]`.
+
+     Args:
+         image: Lab image to be converted to RGB with shape :math:`(*, 3, H, W)`.
+         clip: Whether to apply clipping to ensure output RGB values are in range :math:`[0, 1]`.
+
+     Returns:
+         RGB version of the image with shape :math:`(*, 3, H, W)`.
+         The output RGB image is in the range of :math:`[0, 1]`.
+
+     Example:
+         >>> input = torch.rand(2, 3, 4, 5)
+         >>> output = lab_to_rgb(input)  # 2x3x4x5
+     """
+     if not isinstance(image, torch.Tensor):
+         raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")
+
+     if len(image.shape) < 3 or image.shape[-3] != 3:
+         raise ValueError(
+             f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
+
+     L: torch.Tensor = image[..., 0, :, :]
+     a: torch.Tensor = image[..., 1, :, :]
+     _b: torch.Tensor = image[..., 2, :, :]
+
+     fy = (L + 16.0) / 116.0
+     fx = (a / 500.0) + fy
+     fz = fy - (_b / 200.0)
+
+     # if color data out of range: Z < 0
+     fz = fz.clamp(min=0.0)
+
+     fxyz = torch.stack([fx, fy, fz], dim=-3)
+
+     # Convert from Lab to XYZ
+     power = torch.pow(fxyz, 3.0)
+     scale = (fxyz - 4.0 / 29.0) / 7.787
+     xyz = torch.where(fxyz > 0.2068966, power, scale)
+
+     # For D65 white point
+     xyz_ref_white = torch.tensor(
+         [0.95047, 1.0, 1.08883], device=xyz.device, dtype=xyz.dtype)[..., :, None, None]
+     xyz_im = xyz * xyz_ref_white
+
+     rgbs_im: torch.Tensor = xyz_to_rgb(xyz_im)
+
+     # https://github.com/richzhang/colorization-pytorch/blob/66a1cb2e5258f7c8f374f582acc8b1ef99c13c27/util/util.py#L107
+     # rgbs_im = torch.where(rgbs_im < 0, torch.zeros_like(rgbs_im), rgbs_im)
+
+     # Convert from RGB Linear to sRGB
+     rgb_im = linear_rgb_to_rgb(rgbs_im)
+
+     # Clip to 0,1 https://www.w3.org/Graphics/Color/srgb
+     if clip:
+         rgb_im = torch.clamp(rgb_im, min=0.0, max=1.0)
+
+     return rgb_im
+
+
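rgb_to_lab and lab_to_rgb are intended to be mutual inverses for in-gamut sRGB values in [0, 1]. A minimal round-trip check, not part of the commit, assuming tools.util is imported as a whole (lab_to_rgb relies on xyz_to_rgb and linear_rgb_to_rgb defined below):

import torch
from tools.util import rgb_to_lab, lab_to_rgb

rgb = torch.rand(2, 3, 16, 16)
back = lab_to_rgb(rgb_to_lab(rgb))
print((back - rgb).abs().max())       # expected: small, roughly 1e-4 or below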
+ def rgb_to_xyz(image: torch.Tensor) -> torch.Tensor:
+     r"""Convert an RGB image to XYZ.
+
+     .. image:: _static/img/rgb_to_xyz.png
+
+     Args:
+         image: RGB Image to be converted to XYZ with shape :math:`(*, 3, H, W)`.
+
+     Returns:
+         XYZ version of the image with shape :math:`(*, 3, H, W)`.
+
+     Example:
+         >>> input = torch.rand(2, 3, 4, 5)
+         >>> output = rgb_to_xyz(input)  # 2x3x4x5
+     """
+     if not isinstance(image, torch.Tensor):
+         raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")
+
+     if len(image.shape) < 3 or image.shape[-3] != 3:
+         raise ValueError(
+             f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
+
+     r: torch.Tensor = image[..., 0, :, :]
+     g: torch.Tensor = image[..., 1, :, :]
+     b: torch.Tensor = image[..., 2, :, :]
+
+     x: torch.Tensor = 0.412453 * r + 0.357580 * g + 0.180423 * b
+     y: torch.Tensor = 0.212671 * r + 0.715160 * g + 0.072169 * b
+     z: torch.Tensor = 0.019334 * r + 0.119193 * g + 0.950227 * b
+
+     out: torch.Tensor = torch.stack([x, y, z], -3)
+
+     return out
+
+
+ def xyz_to_rgb(image: torch.Tensor) -> torch.Tensor:
+     r"""Convert an XYZ image to RGB.
+
+     Args:
+         image: XYZ Image to be converted to RGB with shape :math:`(*, 3, H, W)`.
+
+     Returns:
+         RGB version of the image with shape :math:`(*, 3, H, W)`.
+
+     Example:
+         >>> input = torch.rand(2, 3, 4, 5)
+         >>> output = xyz_to_rgb(input)  # 2x3x4x5
+     """
+     if not isinstance(image, torch.Tensor):
+         raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")
+
+     if len(image.shape) < 3 or image.shape[-3] != 3:
+         raise ValueError(
+             f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
+
+     x: torch.Tensor = image[..., 0, :, :]
+     y: torch.Tensor = image[..., 1, :, :]
+     z: torch.Tensor = image[..., 2, :, :]
+
+     r: torch.Tensor = 3.2404813432005266 * x - \
+         1.5371515162713185 * y - 0.4985363261688878 * z
+     g: torch.Tensor = -0.9692549499965682 * x + \
+         1.8759900014898907 * y + 0.0415559265582928 * z
+     b: torch.Tensor = 0.0556466391351772 * x - \
+         0.2040413383665112 * y + 1.0573110696453443 * z
+
+     out: torch.Tensor = torch.stack([r, g, b], dim=-3)
+
+     return out
+
+
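The two 3x3 coefficient matrices in rgb_to_xyz and xyz_to_rgb are meant to be inverses of each other (linear RGB <-> XYZ under D65). A quick numeric check of that assumption, illustrative only:

import numpy as np

M = np.array([[0.412453, 0.357580, 0.180423],
              [0.212671, 0.715160, 0.072169],
              [0.019334, 0.119193, 0.950227]])
M_inv = np.array([[3.2404813432005266, -1.5371515162713185, -0.4985363261688878],
                  [-0.9692549499965682, 1.8759900014898907, 0.0415559265582928],
                  [0.0556466391351772, -0.2040413383665112, 1.0573110696453443]])
print(np.abs(M @ M_inv - np.eye(3)).max())  # expected: ~1e-5 or smaller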
+ def rgb_to_linear_rgb(image: torch.Tensor) -> torch.Tensor:
+     r"""Convert an sRGB image to linear RGB. Used in colorspace conversions.
+
+     .. image:: _static/img/rgb_to_linear_rgb.png
+
+     Args:
+         image: sRGB Image to be converted to linear RGB of shape :math:`(*,3,H,W)`.
+
+     Returns:
+         linear RGB version of the image with shape of :math:`(*,3,H,W)`.
+
+     Example:
+         >>> input = torch.rand(2, 3, 4, 5)
+         >>> output = rgb_to_linear_rgb(input)  # 2x3x4x5
+     """
+     if not isinstance(image, torch.Tensor):
+         raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")
+
+     if len(image.shape) < 3 or image.shape[-3] != 3:
+         raise ValueError(
+             f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
+
+     lin_rgb: torch.Tensor = torch.where(image > 0.04045, torch.pow(
+         ((image + 0.055) / 1.055), 2.4), image / 12.92)
+
+     return lin_rgb
+
+
+ def linear_rgb_to_rgb(image: torch.Tensor) -> torch.Tensor:
+     r"""Convert a linear RGB image to sRGB. Used in colorspace conversions.
+
+     Args:
+         image: linear RGB Image to be converted to sRGB of shape :math:`(*,3,H,W)`.
+
+     Returns:
+         sRGB version of the image with shape of :math:`(*,3,H,W)`.
+
+     Example:
+         >>> input = torch.rand(2, 3, 4, 5)
+         >>> output = linear_rgb_to_rgb(input)  # 2x3x4x5
+     """
+     if not isinstance(image, torch.Tensor):
+         raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")
+
+     if len(image.shape) < 3 or image.shape[-3] != 3:
+         raise ValueError(
+             f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
+
+     threshold = 0.0031308
+     rgb: torch.Tensor = torch.where(
+         image > threshold, 1.055 *
+         torch.pow(image.clamp(min=threshold), 1 / 2.4) - 0.055, 12.92 * image
+     )
+
+     return rgb
+
+
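rgb_to_linear_rgb and linear_rgb_to_rgb implement the two directions of the standard sRGB transfer curve (linear segment below the 0.04045 / 0.0031308 thresholds, a 2.4 exponent above), so a round trip should be lossless up to float precision. Illustrative check, not part of the commit:

import torch
from tools.util import rgb_to_linear_rgb, linear_rgb_to_rgb

srgb = torch.linspace(0.0, 1.0, 101).reshape(1, 1, 101).repeat(3, 1, 1)  # shape (3, 1, 101)
back = linear_rgb_to_rgb(rgb_to_linear_rgb(srgb))
print((back - srgb).abs().max())      # expected: ~1e-6 or smaller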
+ def inference_img(model, img, device='cpu'):
+     h, w, _ = img.shape
+     # print(img.shape)
+     # pad so that height and width are multiples of 8; the padding is cropped
+     # away again by the [-h:, -w:] slice after inference
+     if h % 8 != 0 or w % 8 != 0:
+         img = cv2.copyMakeBorder(img, (8 - h % 8) % 8, 0, (8 - w % 8) % 8, 0,
+                                  cv2.BORDER_REFLECT)
+     # print(img.shape)
+
+     tensor_img = torch.from_numpy(img).permute(2, 0, 1).to(device)
+     input_t = tensor_img
+     input_t = input_t / 255.0
+     normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                      std=[0.229, 0.224, 0.225])
+     input_t = normalize(input_t)
+     input_t = input_t.unsqueeze(0).float()
+     with torch.no_grad():
+         out = model(input_t)
+     # print("out", out.shape)
+     result = out[0][:, -h:, -w:].cpu().numpy()
+     # print(result.shape)
+
+     return result[0]
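A hypothetical usage sketch of inference_img (the dummy model below is illustration only; the real model wiring lives in app.py and tools/inference.py, and the matting network itself in tools/stylematte.py). It shows the expected interface: an HWC uint8 RGB array in, a single-channel map cropped back to the original height and width out:

import numpy as np
import torch.nn as nn
from tools.util import inference_img

# stand-in for the real network: any module mapping (1, 3, H, W) -> (1, 1, H, W)
dummy_model = nn.Sequential(nn.Conv2d(3, 1, kernel_size=3, padding=1), nn.Sigmoid()).eval()

rgb = (np.random.rand(250, 333, 3) * 255).astype(np.uint8)  # deliberately not a multiple of 8
alpha = inference_img(dummy_model, rgb)
print(alpha.shape)                    # expected: (250, 333)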