NikeZoldyck committed
Commit 4158574
1 Parent(s): 4ac8bc1

adding the gradio app code
app.py ADDED
@@ -0,0 +1,152 @@
+ from pathlib import Path
+ import numpy as np
+ import gradio as gr
+ import utils.shared_utils as st
+
+ import torch
+ from torch import autocast
+ import torchvision.transforms as T
+ from contextlib import nullcontext
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ # Mixed-precision autocast on GPU; a no-op context manager on CPU.
+ context = autocast if device == "cuda" else nullcontext
+
+
+ def select_input(input_img, webcm_img):
+     # Prefer the uploaded image; fall back to the webcam capture.
+     if input_img is None:
+         img = webcm_img
+     else:
+         img = input_img
+     return img
+
+
+ def infer(prompt, samples):
+     # Call the Stable Diffusion API once per requested sample.
+     images = []
+     selections = ["Img_{}".format(str(i + 1).zfill(2)) for i in range(samples)]  # currently unused
+     with context(device):
+         for _ in range(samples):
+             back_img = st.stableDiffusionAPICall(prompt)
+             images.append(back_img)
+     return images
+
+
+ def change_bg_option(choice):
+     if choice == "I have an Image":
+         return gr.Image(shape=(800, 800))
+     elif choice == "Generate one for me":
+         return gr.update(lines=8, visible=True, value="Please enter a text prompt")
+     else:
+         return gr.update(visible=False)
+
+
+ # TEXT
+ title = "FSDL - One-Shot, Green-Screen, Composition-Transfer"
+ DEFAULT_TEXT = "Photorealistic scenery of bookshelf in a room"
+ description = """
+ <center><a href="https://docs.google.com/document/d/1fde8XKIMT1nNU72859ytd2c58LFBxepS3od9KFBrJbM/edit?usp=sharing">[PAPER]</a> <a href="https://github.com/snknitin/FSDL-Project/blob/main/src/utils/shared_utils.py">[CODE]</a></center>
+ <details>
+ <summary><b>Instructions</b></summary>
+ <p style="margin-top: -3px;">With this app, you can generate a suitable background image to overlay your portrait!<br />You have several ways to set how your final auto-edited image will look:<br /></p>
+ <ul style="margin-top: -20px;margin-bottom: -15px;">
+ <li style="margin-bottom: -10px;margin-left: 20px;">Use the "<i>Inputs</i>" tab to either upload an image from your device or allow the use of your webcam to capture one</li>
+ <li style="margin-left: 20px;">Use the "<i>Background Image Inputs</i>" to upload your own background</li>
+ <li style="margin-left: 20px;">Use the "<i>Text prompt</i>" tab to generate a satisfactory background image.</li>
+ </ul>
+ <p>After customization, just hit "<i>Edit</i>" and wait a few seconds.<br />The final image will be available for download.<br /><b>Enjoy!</b></p>
+ </details>
+ """
+
+ running = """
+ ### Instructions for running the 3 S's in sequence
+
+ * **Superimpose** - Isolates the foreground from your image and overlays it on the background, removing the original background via alpha matting.
+ * **Style-Transfer** - Transfers the style from your original image to re-map your new background realistically. Uses NVIDIA FastPhotoStyle.
+ * **Smoothing** - Since image resolution and clarity can be an issue, this step makes your final image crisp after the stylization transfer. Fair warning: this last process can take 5-10 minutes.
+ """
+
+
+ demo = gr.Blocks()
+
+ with demo:
+     gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>" + title + "</h1>")
+     with gr.Box():
+         gr.Markdown(description)
+     # First row - inputs
+     with gr.Row(scale=1):
+         with gr.Column():
+             with gr.Tabs():
+                 with gr.TabItem("Upload"):
+                     input_img = gr.Image(shape=(800, 800), interactive=True, label="You")
+                 with gr.TabItem("Webcam Capture"):
+                     webcm_img = gr.Image(source="webcam", streaming=True, shape=(800, 800), interactive=True)
+             inp_select_btn = gr.Button("Select")
+
+         with gr.Column():
+             with gr.Tabs():
+                 with gr.TabItem("Upload"):
+                     bgm_img = gr.Image(shape=(800, 800), type="pil", interactive=True, label="The Background")
+                     bgm_select_btn = gr.Button("Select")
+
+                 with gr.TabItem("Generate via Text Prompt"):
+                     with gr.Box():
+                         with gr.Row().style(mobile_collapse=False, equal_height=True):
+                             text = gr.Textbox(lines=7,
+                                               placeholder="Enter your prompt to generate a background image... something like - Photorealistic scenery of bookshelf in a room")
+
+                             samples = gr.Slider(label="Number of Images", minimum=1, maximum=5, value=2, step=1)
+                             btn = gr.Button("Generate images", variant="primary").style(
+                                 margin=False,
+                                 rounded=(False, True, True, False),
+                             )
+
+                     gallery = gr.Gallery(label="Generated images", show_label=True).style(grid=(1, 3), height="auto")
+                     text.submit(infer, inputs=[text, samples], outputs=gallery)
+                     btn.click(infer, inputs=[text, samples], outputs=gallery, show_progress=True, status_tracker=None)
+
+     # Second row - the selected foreground and background
+     with gr.Row(scale=1):
+         with gr.Column():
+             final_input_img = gr.Image(shape=(800, 800), type="pil", label="Foreground")
+
+         with gr.Column():
+             final_back_img = gr.Image(shape=(800, 800), type="pil", label="Background", interactive=True)
+
+     bgm_select_btn.click(fn=lambda x: x, inputs=bgm_img, outputs=final_back_img)
+     inp_select_btn.click(select_input, [input_img, webcm_img], final_input_img)
+
+     with gr.Row(scale=1):
+         with gr.Box():
+             gr.Markdown(running)
+
+     with gr.Row(scale=1):
+         with gr.Column(scale=1):
+             supimp_btn = gr.Button("SuperImpose")
+             overlay_img = gr.Image(shape=(800, 800), label="Overlay", type="pil")
+
+         with gr.Column(scale=1):
+             style_btn = gr.Button("Composition-Transfer", variant="primary")
+             style_img = gr.Image(shape=(800, 800), label="Style-Transfer Image", type="pil")
+
+         with gr.Column(scale=1):
+             submit_btn = gr.Button("Smoothen", variant="primary")
+             output_img = gr.Image(shape=(800, 800), label="Final Smoothened Image", type="pil")
+
+     supimp_btn.click(fn=st.superimpose, inputs=[final_input_img, final_back_img], outputs=[overlay_img])
+     style_btn.click(fn=st.style_transfer, inputs=[overlay_img, final_input_img], outputs=[style_img])
+     submit_btn.click(fn=st.smoother, inputs=[style_img, overlay_img], outputs=[output_img])
+
+ demo.queue()
+ demo.launch()
+
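The three action buttons above simply chain the helpers in `utils.shared_utils`. A minimal headless sketch of the same pipeline (assuming the repo root is on `PYTHONPATH`; `fg.png` and `bg.png` are hypothetical 800x800 test images):

```python
# Run the SuperImpose -> Composition-Transfer -> Smoothen chain without the UI.
from PIL import Image
import utils.shared_utils as st

fg = Image.open("fg.png").resize((800, 800))  # foreground portrait (hypothetical path)
bg = Image.open("bg.png").resize((800, 800))  # background image (hypothetical path)

overlay = st.superimpose(fg, bg)          # what the "SuperImpose" button calls
styled = st.style_transfer(overlay, fg)   # what the "Composition-Transfer" button calls
final = st.smoother(styled, overlay)      # what the "Smoothen" button calls
final.save("final.png")
```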
models/__init__.py ADDED
File without changes
models/components/__init__.py ADDED
File without changes
models/components/photo_wct.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bedc114a83833de79e92b7166b37bc522db71a30bbfa13d0c4f36387789c8af5
+ size 33410469
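The `.pth` checkpoint is stored with Git LFS, so the commit records only the three-line pointer above (spec version, SHA-256 object id, size in bytes) rather than the ~33 MB of weights. A small sketch (a hypothetical helper, not part of this repo) for reading those fields from an un-smudged checkout:

```python
# Parse the "key value" lines of a Git LFS pointer file into a dict.
from pathlib import Path

def read_lfs_pointer(path):
    lines = Path(path).read_text().splitlines()
    return dict(line.split(" ", 1) for line in lines if line.strip())

# read_lfs_pointer("models/components/photo_wct.pth")
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:...', 'size': '33410469'}
```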
models/models.py ADDED
@@ -0,0 +1,297 @@
+ """
+ Copyright (C) 2018 NVIDIA Corporation. All rights reserved.
+ Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
+ """
+ import torch.nn as nn
+
+
+ class VGGEncoder(nn.Module):
+     def __init__(self, level):
+         super(VGGEncoder, self).__init__()
+         self.level = level
+
+         # 224 x 224
+         self.conv0 = nn.Conv2d(3, 3, 1, 1, 0)
+
+         self.pad1_1 = nn.ReflectionPad2d((1, 1, 1, 1))
+         # 226 x 226
+         self.conv1_1 = nn.Conv2d(3, 64, 3, 1, 0)
+         self.relu1_1 = nn.ReLU(inplace=True)
+         # 224 x 224
+
+         if level < 2: return
+
+         self.pad1_2 = nn.ReflectionPad2d((1, 1, 1, 1))
+         self.conv1_2 = nn.Conv2d(64, 64, 3, 1, 0)
+         self.relu1_2 = nn.ReLU(inplace=True)
+         # 224 x 224
+         self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)
+         # 112 x 112
+
+         self.pad2_1 = nn.ReflectionPad2d((1, 1, 1, 1))
+         self.conv2_1 = nn.Conv2d(64, 128, 3, 1, 0)
+         self.relu2_1 = nn.ReLU(inplace=True)
+         # 112 x 112
+
+         if level < 3: return
+
+         self.pad2_2 = nn.ReflectionPad2d((1, 1, 1, 1))
+         self.conv2_2 = nn.Conv2d(128, 128, 3, 1, 0)
+         self.relu2_2 = nn.ReLU(inplace=True)
+         # 112 x 112
+
+         self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)
+         # 56 x 56
+
+         self.pad3_1 = nn.ReflectionPad2d((1, 1, 1, 1))
+         self.conv3_1 = nn.Conv2d(128, 256, 3, 1, 0)
+         self.relu3_1 = nn.ReLU(inplace=True)
+         # 56 x 56
+
+         if level < 4: return
+
+         self.pad3_2 = nn.ReflectionPad2d((1, 1, 1, 1))
+         self.conv3_2 = nn.Conv2d(256, 256, 3, 1, 0)
+         self.relu3_2 = nn.ReLU(inplace=True)
+         # 56 x 56
+
+         self.pad3_3 = nn.ReflectionPad2d((1, 1, 1, 1))
+         self.conv3_3 = nn.Conv2d(256, 256, 3, 1, 0)
+         self.relu3_3 = nn.ReLU(inplace=True)
+         # 56 x 56
+
+         self.pad3_4 = nn.ReflectionPad2d((1, 1, 1, 1))
+         self.conv3_4 = nn.Conv2d(256, 256, 3, 1, 0)
+         self.relu3_4 = nn.ReLU(inplace=True)
+         # 56 x 56
+
+         self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)
+         # 28 x 28
+
+         self.pad4_1 = nn.ReflectionPad2d((1, 1, 1, 1))
+         self.conv4_1 = nn.Conv2d(256, 512, 3, 1, 0)
+         self.relu4_1 = nn.ReLU(inplace=True)
+         # 28 x 28
+
+     def forward(self, x):
+         out = self.conv0(x)
+
+         out = self.pad1_1(out)
+         out = self.conv1_1(out)
+         out = self.relu1_1(out)
+
+         if self.level < 2:
+             return out
+
+         out = self.pad1_2(out)
+         out = self.conv1_2(out)
+         pool1 = self.relu1_2(out)
+
+         out, pool1_idx = self.maxpool1(pool1)
+
+         out = self.pad2_1(out)
+         out = self.conv2_1(out)
+         out = self.relu2_1(out)
+
+         if self.level < 3:
+             return out, pool1_idx, pool1.size()
+
+         out = self.pad2_2(out)
+         out = self.conv2_2(out)
+         pool2 = self.relu2_2(out)
+
+         out, pool2_idx = self.maxpool2(pool2)
+
+         out = self.pad3_1(out)
+         out = self.conv3_1(out)
+         out = self.relu3_1(out)
+
+         if self.level < 4:
+             return out, pool1_idx, pool1.size(), pool2_idx, pool2.size()
+
+         out = self.pad3_2(out)
+         out = self.conv3_2(out)
+         out = self.relu3_2(out)
+
+         out = self.pad3_3(out)
+         out = self.conv3_3(out)
+         out = self.relu3_3(out)
+
+         out = self.pad3_4(out)
+         out = self.conv3_4(out)
+         pool3 = self.relu3_4(out)
+         out, pool3_idx = self.maxpool3(pool3)
+
+         out = self.pad4_1(out)
+         out = self.conv4_1(out)
+         out = self.relu4_1(out)
+
+         return out, pool1_idx, pool1.size(), pool2_idx, pool2.size(), pool3_idx, pool3.size()
+
+     def forward_multiple(self, x):
+         out = self.conv0(x)
+
+         out = self.pad1_1(out)
+         out = self.conv1_1(out)
+         out = self.relu1_1(out)
+
+         if self.level < 2: return out
+
+         out1 = out
+
+         out = self.pad1_2(out)
+         out = self.conv1_2(out)
+         pool1 = self.relu1_2(out)
+
+         out, pool1_idx = self.maxpool1(pool1)
+
+         out = self.pad2_1(out)
+         out = self.conv2_1(out)
+         out = self.relu2_1(out)
+
+         if self.level < 3: return out, out1
+
+         out2 = out
+
+         out = self.pad2_2(out)
+         out = self.conv2_2(out)
+         pool2 = self.relu2_2(out)
+
+         out, pool2_idx = self.maxpool2(pool2)
+
+         out = self.pad3_1(out)
+         out = self.conv3_1(out)
+         out = self.relu3_1(out)
+
+         if self.level < 4: return out, out2, out1
+
+         out3 = out
+
+         out = self.pad3_2(out)
+         out = self.conv3_2(out)
+         out = self.relu3_2(out)
+
+         out = self.pad3_3(out)
+         out = self.conv3_3(out)
+         out = self.relu3_3(out)
+
+         out = self.pad3_4(out)
+         out = self.conv3_4(out)
+         pool3 = self.relu3_4(out)
+         out, pool3_idx = self.maxpool3(pool3)
+
+         out = self.pad4_1(out)
+         out = self.conv4_1(out)
+         out = self.relu4_1(out)
+
+         return out, out3, out2, out1
+
+
+ class VGGDecoder(nn.Module):
+     def __init__(self, level):
+         super(VGGDecoder, self).__init__()
+         self.level = level
+
+         if level > 3:
+             self.pad4_1 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv4_1 = nn.Conv2d(512, 256, 3, 1, 0)
+             self.relu4_1 = nn.ReLU(inplace=True)
+             # 28 x 28
+
+             self.unpool3 = nn.MaxUnpool2d(kernel_size=2, stride=2)
+             # 56 x 56
+
+             self.pad3_4 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv3_4 = nn.Conv2d(256, 256, 3, 1, 0)
+             self.relu3_4 = nn.ReLU(inplace=True)
+             # 56 x 56
+
+             self.pad3_3 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv3_3 = nn.Conv2d(256, 256, 3, 1, 0)
+             self.relu3_3 = nn.ReLU(inplace=True)
+             # 56 x 56
+
+             self.pad3_2 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv3_2 = nn.Conv2d(256, 256, 3, 1, 0)
+             self.relu3_2 = nn.ReLU(inplace=True)
+             # 56 x 56
+
+         if level > 2:
+             self.pad3_1 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv3_1 = nn.Conv2d(256, 128, 3, 1, 0)
+             self.relu3_1 = nn.ReLU(inplace=True)
+             # 56 x 56
+
+             self.unpool2 = nn.MaxUnpool2d(kernel_size=2, stride=2)
+             # 112 x 112
+
+             self.pad2_2 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv2_2 = nn.Conv2d(128, 128, 3, 1, 0)
+             self.relu2_2 = nn.ReLU(inplace=True)
+             # 112 x 112
+
+         if level > 1:
+             self.pad2_1 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv2_1 = nn.Conv2d(128, 64, 3, 1, 0)
+             self.relu2_1 = nn.ReLU(inplace=True)
+             # 112 x 112
+
+             self.unpool1 = nn.MaxUnpool2d(kernel_size=2, stride=2)
+             # 224 x 224
+
+             self.pad1_2 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv1_2 = nn.Conv2d(64, 64, 3, 1, 0)
+             self.relu1_2 = nn.ReLU(inplace=True)
+             # 224 x 224
+
+         if level > 0:
+             self.pad1_1 = nn.ReflectionPad2d((1, 1, 1, 1))
+             self.conv1_1 = nn.Conv2d(64, 3, 3, 1, 0)
+
+     def forward(self, x, pool1_idx=None, pool1_size=None, pool2_idx=None, pool2_size=None, pool3_idx=None,
+                 pool3_size=None):
+         out = x
+
+         if self.level > 3:
+             out = self.pad4_1(out)
+             out = self.conv4_1(out)
+             out = self.relu4_1(out)
+             out = self.unpool3(out, pool3_idx, output_size=pool3_size)
+
+             out = self.pad3_4(out)
+             out = self.conv3_4(out)
+             out = self.relu3_4(out)
+
+             out = self.pad3_3(out)
+             out = self.conv3_3(out)
+             out = self.relu3_3(out)
+
+             out = self.pad3_2(out)
+             out = self.conv3_2(out)
+             out = self.relu3_2(out)
+
+         if self.level > 2:
+             out = self.pad3_1(out)
+             out = self.conv3_1(out)
+             out = self.relu3_1(out)
+             out = self.unpool2(out, pool2_idx, output_size=pool2_size)
+
+             out = self.pad2_2(out)
+             out = self.conv2_2(out)
+             out = self.relu2_2(out)
+
+         if self.level > 1:
+             out = self.pad2_1(out)
+             out = self.conv2_1(out)
+             out = self.relu2_1(out)
+             out = self.unpool1(out, pool1_idx, output_size=pool1_size)
+
+             out = self.pad1_2(out)
+             out = self.conv1_2(out)
+             out = self.relu1_2(out)
+
+         if self.level > 0:
+             out = self.pad1_1(out)
+             out = self.conv1_1(out)
+
+         return out
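Each `VGGEncoder` level returns not just features but also the max-pool indices and pre-pool sizes, which the matching `VGGDecoder` needs for exact unpooling. A quick shape round-trip sketch (assuming `models.models` is importable):

```python
# Level-4 encoder/decoder round trip: unpooling restores the input resolution.
import torch
from models.models import VGGEncoder, VGGDecoder

enc, dec = VGGEncoder(4), VGGDecoder(4)
x = torch.randn(1, 3, 224, 224)
feat, i1, s1, i2, s2, i3, s3 = enc(x)   # feat: (1, 512, 28, 28) plus pooling bookkeeping
y = dec(feat, i1, s1, i2, s2, i3, s3)   # y: (1, 3, 224, 224)
print(feat.shape, y.shape)
```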
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ --extra-index-url https://download.pytorch.org/whl/cu116
+ torch
+ diffusers
+ transformers
+ scipy
+ ftfy
+ gradio
+ torchvision
+ scikit-image
+ rembg
+ replicate
+ requests
+ Pillow
+ numpy
+ pyrootutils
+ pynvrtc
+ cupy
utils/__init__.py ADDED
@@ -0,0 +1,12 @@
+ # NOTE: these imports assume the original FSDL project's `src/` package layout.
+ from src.utils.pylogger import get_pylogger
+ from src.utils.rich_utils import enforce_tags, print_config_tree
+ from src.utils.utils import (
+     close_loggers,
+     extras,
+     get_metric_value,
+     instantiate_callbacks,
+     instantiate_loggers,
+     log_hyperparameters,
+     save_file,
+     task_wrapper,
+ )
utils/photo_smooth.py ADDED
@@ -0,0 +1,101 @@
+ """
+ Copyright (C) 2018 NVIDIA Corporation. All rights reserved.
+ Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
+ """
+ from __future__ import division
+ import torch.nn as nn
+ import numpy as np
+ import scipy.sparse
+ import scipy.sparse.linalg as linalg
+ from numpy.lib.stride_tricks import as_strided
+ from PIL import Image
+
+
+ class Propagator(nn.Module):
+     def __init__(self, beta=0.9999):
+         super(Propagator, self).__init__()
+         self.beta = beta
+
+     def process(self, initImg, contentImg):
+         # Accept either file paths or in-memory images for both arguments.
+         # (scipy.misc.imread was removed in SciPy >= 1.2, so PIL is used instead.)
+         if isinstance(contentImg, str):
+             content = np.asarray(Image.open(contentImg).convert('RGB'))
+         else:
+             content = contentImg.copy()
+
+         if isinstance(initImg, str):
+             B = np.asarray(Image.open(initImg).convert('RGB')).astype(np.float64) / 255
+         else:
+             B = np.asarray(initImg).astype(np.float64) / 255
+
+         h1, w1, k = B.shape
+         h = h1 - 4
+         w = w1 - 4
+         B = B[int((h1-h)/2):int((h1-h)/2+h), int((w1-w)/2):int((w1-w)/2+w), :]
+         # PIL's resize takes (width, height), so the target is (w, h).
+         content = np.asarray(Image.fromarray(np.array(content)).resize((w, h), Image.BICUBIC))
+         B = self.__replication_padding(B, 2)
+         content = self.__replication_padding(content, 2)
+         content = content.astype(np.float64) / 255
+         B = np.reshape(B, (h1*w1, k))
+         W = self.__compute_laplacian(content)
+         W = W.tocsc()
+         dd = W.sum(0)
+         dd = np.sqrt(np.power(dd, -1))
+         dd = dd.A.squeeze()
+         D = scipy.sparse.csc_matrix((dd, (np.arange(0, w1*h1), np.arange(0, w1*h1))))
+         S = D.dot(W).dot(D)
+         A = scipy.sparse.identity(w1*h1) - self.beta*S
+         A = A.tocsc()
+         solver = linalg.factorized(A)
+         V = np.zeros((h1*w1, k))
+         V[:, 0] = solver(B[:, 0])
+         V[:, 1] = solver(B[:, 1])
+         V[:, 2] = solver(B[:, 2])
+         V = V*(1-self.beta)
+         V = V.reshape(h1, w1, k)
+         V = V[2:2+h, 2:2+w, :]
+
+         img = Image.fromarray(np.uint8(np.clip(V * 255., 0, 255.)))
+         return img
+
+     # Returns the sparse matting Laplacian.
+     # The implementation of the function is heavily borrowed from
+     # https://github.com/MarcoForte/closed-form-matting/blob/master/closed_form_matting.py
+     # We thank Marco Forte for sharing his code.
+     def __compute_laplacian(self, img, eps=10**(-7), win_rad=1):
+         win_size = (win_rad*2+1)**2
+         h, w, d = img.shape
+         c_h, c_w = h - 2*win_rad, w - 2*win_rad
+         win_diam = win_rad*2+1
+         indsM = np.arange(h*w).reshape((h, w))
+         ravelImg = img.reshape(h*w, d)
+         win_inds = self.__rolling_block(indsM, block=(win_diam, win_diam))
+         win_inds = win_inds.reshape(c_h, c_w, win_size)
+         winI = ravelImg[win_inds]
+         win_mu = np.mean(winI, axis=2, keepdims=True)
+         win_var = np.einsum('...ji,...jk ->...ik', winI, winI)/win_size - np.einsum('...ji,...jk ->...ik', win_mu, win_mu)
+         inv = np.linalg.inv(win_var + (eps/win_size)*np.eye(3))
+         X = np.einsum('...ij,...jk->...ik', winI - win_mu, inv)
+         vals = (1/win_size)*(1 + np.einsum('...ij,...kj->...ik', X, winI - win_mu))
+         nz_indsCol = np.tile(win_inds, win_size).ravel()
+         nz_indsRow = np.repeat(win_inds, win_size).ravel()
+         nz_indsVal = vals.ravel()
+         L = scipy.sparse.coo_matrix((nz_indsVal, (nz_indsRow, nz_indsCol)), shape=(h*w, h*w))
+         return L
+
+     def __replication_padding(self, arr, pad):
+         h, w, c = arr.shape
+         ans = np.zeros((h+pad*2, w+pad*2, c))
+         for i in range(c):
+             ans[:, :, i] = np.pad(arr[:, :, i], pad_width=(pad, pad), mode='edge')
+         return ans
+
+     def __rolling_block(self, A, block=(3, 3)):
+         shape = (A.shape[0] - block[0] + 1, A.shape[1] - block[1] + 1) + block
+         strides = (A.strides[0], A.strides[1]) + A.strides
+         return as_strided(A, shape=shape, strides=strides)
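In matrix form, `Propagator.process` solves one sparse linear system per RGB channel (my reading of the code above, stated in standard notation): with $W$ the matting-Laplacian affinities from `__compute_laplacian`, $D = \mathrm{diag}(W\mathbf{1})$ its degree matrix, $Y$ the padded, flattened stylized image, and $\beta = 0.9999$,

$$
S = D^{-1/2} W D^{-1/2}, \qquad R = (1-\beta)\,(I - \beta S)^{-1} Y,
$$

i.e. each output channel is the stylized channel diffused along the edges of the guide (content) image.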
utils/photo_wct.py ADDED
@@ -0,0 +1,171 @@
+ """
+ Copyright (C) 2018 NVIDIA Corporation. All rights reserved.
+ Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
+ """
+
+ import numpy as np
+ from PIL import Image
+ import torch
+ import torch.nn as nn
+ from models.models import VGGEncoder, VGGDecoder
+
+
+ class PhotoWCT(nn.Module):
+     def __init__(self):
+         super(PhotoWCT, self).__init__()
+         self.e1 = VGGEncoder(1)
+         self.d1 = VGGDecoder(1)
+         self.e2 = VGGEncoder(2)
+         self.d2 = VGGDecoder(2)
+         self.e3 = VGGEncoder(3)
+         self.d3 = VGGDecoder(3)
+         self.e4 = VGGEncoder(4)
+         self.d4 = VGGDecoder(4)
+
+     def transform(self, cont_img, styl_img, cont_seg, styl_seg):
+         self.__compute_label_info(cont_seg, styl_seg)
+
+         sF4, sF3, sF2, sF1 = self.e4.forward_multiple(styl_img)
+
+         # Stylize coarse-to-fine: WCT at level 4, decode, re-encode at level 3, and so on.
+         cF4, cpool_idx, cpool1, cpool_idx2, cpool2, cpool_idx3, cpool3 = self.e4(cont_img)
+         sF4 = sF4.data.squeeze(0)
+         cF4 = cF4.data.squeeze(0)
+         csF4 = self.__feature_wct(cF4, sF4, cont_seg, styl_seg)
+         Im4 = self.d4(csF4, cpool_idx, cpool1, cpool_idx2, cpool2, cpool_idx3, cpool3)
+
+         cF3, cpool_idx, cpool1, cpool_idx2, cpool2 = self.e3(Im4)
+         sF3 = sF3.data.squeeze(0)
+         cF3 = cF3.data.squeeze(0)
+         csF3 = self.__feature_wct(cF3, sF3, cont_seg, styl_seg)
+         Im3 = self.d3(csF3, cpool_idx, cpool1, cpool_idx2, cpool2)
+
+         cF2, cpool_idx, cpool = self.e2(Im3)
+         sF2 = sF2.data.squeeze(0)
+         cF2 = cF2.data.squeeze(0)
+         csF2 = self.__feature_wct(cF2, sF2, cont_seg, styl_seg)
+         Im2 = self.d2(csF2, cpool_idx, cpool)
+
+         cF1 = self.e1(Im2)
+         sF1 = sF1.data.squeeze(0)
+         cF1 = cF1.data.squeeze(0)
+         csF1 = self.__feature_wct(cF1, sF1, cont_seg, styl_seg)
+         Im1 = self.d1(csF1)
+         return Im1
+
+     def __compute_label_info(self, cont_seg, styl_seg):
+         if cont_seg.size == 0 or styl_seg.size == 0:
+             return
+         max_label = np.max(cont_seg) + 1
+         self.label_set = np.unique(cont_seg)
+         self.label_indicator = np.zeros(max_label)
+         for l in self.label_set:
+             # A label is usable when both masks are big enough and comparable in size.
+             is_valid = lambda a, b: a > 10 and b > 10 and a / b < 100 and b / a < 100
+             o_cont_mask = np.where(cont_seg.reshape(cont_seg.shape[0] * cont_seg.shape[1]) == l)
+             o_styl_mask = np.where(styl_seg.reshape(styl_seg.shape[0] * styl_seg.shape[1]) == l)
+             self.label_indicator[l] = is_valid(o_cont_mask[0].size, o_styl_mask[0].size)
+
+     def __feature_wct(self, cont_feat, styl_feat, cont_seg, styl_seg):
+         cont_c, cont_h, cont_w = cont_feat.size(0), cont_feat.size(1), cont_feat.size(2)
+         styl_c, styl_h, styl_w = styl_feat.size(0), styl_feat.size(1), styl_feat.size(2)
+         cont_feat_view = cont_feat.view(cont_c, -1).clone()
+         styl_feat_view = styl_feat.view(styl_c, -1).clone()
+
+         if cont_seg.size == 0 or styl_seg.size == 0:
+             # No segmentation maps: apply WCT globally.
+             target_feature = self.__wct_core(cont_feat_view, styl_feat_view)
+         else:
+             target_feature = cont_feat.view(cont_c, -1).clone()
+             if len(cont_seg.shape) == 2:
+                 t_cont_seg = np.asarray(Image.fromarray(cont_seg).resize((cont_w, cont_h), Image.NEAREST))
+             else:
+                 t_cont_seg = np.asarray(Image.fromarray(cont_seg, mode='RGB').resize((cont_w, cont_h), Image.NEAREST))
+             if len(styl_seg.shape) == 2:
+                 t_styl_seg = np.asarray(Image.fromarray(styl_seg).resize((styl_w, styl_h), Image.NEAREST))
+             else:
+                 t_styl_seg = np.asarray(Image.fromarray(styl_seg, mode='RGB').resize((styl_w, styl_h), Image.NEAREST))
+
+             for l in self.label_set:
+                 if self.label_indicator[l] == 0:
+                     continue
+                 cont_mask = np.where(t_cont_seg.reshape(t_cont_seg.shape[0] * t_cont_seg.shape[1]) == l)
+                 styl_mask = np.where(t_styl_seg.reshape(t_styl_seg.shape[0] * t_styl_seg.shape[1]) == l)
+                 if cont_mask[0].size <= 0 or styl_mask[0].size <= 0:
+                     continue
+
+                 cont_indi = torch.LongTensor(cont_mask[0])
+                 styl_indi = torch.LongTensor(styl_mask[0])
+                 if self.is_cuda:
+                     cont_indi = cont_indi.cuda(0)
+                     styl_indi = styl_indi.cuda(0)
+
+                 cFFG = torch.index_select(cont_feat_view, 1, cont_indi)
+                 sFFG = torch.index_select(styl_feat_view, 1, styl_indi)
+                 tmp_target_feature = self.__wct_core(cFFG, sFFG)
+                 if torch.__version__ >= "0.4.0":
+                     # index_copy_ along dim 1 misbehaved in PyTorch 0.4.0, so copy along dim 0 instead.
+                     new_target_feature = torch.transpose(target_feature, 1, 0)
+                     new_target_feature.index_copy_(0, cont_indi,
+                                                    torch.transpose(tmp_target_feature, 1, 0))
+                     target_feature = torch.transpose(new_target_feature, 1, 0)
+                 else:
+                     target_feature.index_copy_(1, cont_indi, tmp_target_feature)
+
+         target_feature = target_feature.view_as(cont_feat)
+         ccsF = target_feature.float().unsqueeze(0)
+         return ccsF
+
+     def __wct_core(self, cont_feat, styl_feat):
+         cFSize = cont_feat.size()
+         c_mean = torch.mean(cont_feat, 1)  # c x (h x w)
+         c_mean = c_mean.unsqueeze(1).expand_as(cont_feat)
+         cont_feat = cont_feat - c_mean
+
+         iden = torch.eye(cFSize[0])  # .double()
+         if self.is_cuda:
+             iden = iden.cuda()
+
+         contentConv = torch.mm(cont_feat, cont_feat.t()).div(cFSize[1] - 1) + iden
+         c_u, c_e, c_v = torch.svd(contentConv, some=False)
+
+         # Keep only eigenvalues above a small threshold.
+         k_c = cFSize[0]
+         for i in range(cFSize[0] - 1, -1, -1):
+             if c_e[i] >= 0.00001:
+                 k_c = i + 1
+                 break
+
+         sFSize = styl_feat.size()
+         s_mean = torch.mean(styl_feat, 1)
+         styl_feat = styl_feat - s_mean.unsqueeze(1).expand_as(styl_feat)
+         styleConv = torch.mm(styl_feat, styl_feat.t()).div(sFSize[1] - 1)
+         s_u, s_e, s_v = torch.svd(styleConv, some=False)
+
+         k_s = sFSize[0]
+         for i in range(sFSize[0] - 1, -1, -1):
+             if s_e[i] >= 0.00001:
+                 k_s = i + 1
+                 break
+
+         # Whitening: remove the content covariance.
+         c_d = (c_e[0:k_c]).pow(-0.5)
+         step1 = torch.mm(c_v[:, 0:k_c], torch.diag(c_d))
+         step2 = torch.mm(step1, (c_v[:, 0:k_c].t()))
+         whiten_cF = torch.mm(step2, cont_feat)
+
+         # Coloring: impose the style covariance, then add back the style mean.
+         s_d = (s_e[0:k_s]).pow(0.5)
+         targetFeature = torch.mm(torch.mm(torch.mm(s_v[:, 0:k_s], torch.diag(s_d)), (s_v[:, 0:k_s].t())), whiten_cF)
+         targetFeature = targetFeature + s_mean.unsqueeze(1).expand_as(targetFeature)
+         return targetFeature
+
+     @property
+     def is_cuda(self):
+         return next(self.parameters()).is_cuda
+
+     def forward(self, *input):
+         pass
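`__wct_core` is the classic whitening-and-coloring transform. In standard notation (matching the SVD steps above; $f_c, f_s$ are the mean-centered content and style features with covariance eigendecompositions $E_c \Lambda_c E_c^\top$ and $E_s \Lambda_s E_s^\top$, truncated at eigenvalues below $10^{-5}$):

$$
\hat{f}_c = E_c \Lambda_c^{-1/2} E_c^\top f_c,
\qquad
f_{cs} = E_s \Lambda_s^{1/2} E_s^\top \hat{f}_c + \mu_s,
$$

so the stylized features carry the style's second-order statistics while preserving the content's spatial structure.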
utils/shared_utils.py ADDED
@@ -0,0 +1,136 @@
+ from pathlib import Path
+ import io
+
+ import numpy as np
+ import requests
+ import replicate
+ import pyrootutils
+ import torch
+ import torch.nn as nn
+ import torchvision.transforms as transforms
+ import torchvision.utils as utils
+ from PIL import Image
+ from rembg import remove
+
+ from utils.photo_wct import PhotoWCT
+ from utils.photo_smooth import Propagator
+
+ # Locate the repo root and load the pretrained models once at import time.
+ root = pyrootutils.setup_root(Path.cwd(), pythonpath=True)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ p_wct = PhotoWCT()
+ p_wct.load_state_dict(torch.load(root/"models/components/photo_wct.pth"))
+ p_pro = Propagator()
+ stylization_module = p_wct
+ smoothing_module = p_pro
+
+
+ # Dependencies - to be installed:
+ #   pip install replicate
+ # The API token must be declared as an environment variable before launch:
+ #   export REPLICATE_API_TOKEN=<your-token>
+
+
+ def stableDiffusionAPICall(text_prompt):
+     # The replicate client reads REPLICATE_API_TOKEN from the environment.
+     model = replicate.models.get("stability-ai/stable-diffusion")
+     # e.g. text_prompt = 'photorealistic, elf fighting Sauron'
+     gen_bg_img = model.predict(prompt=text_prompt)[0]
+     # The API returns a URL; fetch it and decode the bytes into a PIL image.
+     img_data = requests.get(gen_bg_img).content
+     stream = io.BytesIO(img_data)
+     img = Image.open(stream)
+     del img_data
+
+     return img
+
+
+ def memory_limit_image_resize(cont_img):
+     # Prevent too small or too big images (resized in place, preserving aspect ratio).
+     MINSIZE = 400
+     MAXSIZE = 800
+     orig_width = cont_img.width
+     orig_height = cont_img.height
+     if max(cont_img.width, cont_img.height) < MINSIZE:
+         if cont_img.width > cont_img.height:
+             cont_img.thumbnail((int(cont_img.width*1.0/cont_img.height*MINSIZE), MINSIZE), Image.BICUBIC)
+         else:
+             cont_img.thumbnail((MINSIZE, int(cont_img.height*1.0/cont_img.width*MINSIZE)), Image.BICUBIC)
+     if min(cont_img.width, cont_img.height) > MAXSIZE:
+         if cont_img.width > cont_img.height:
+             cont_img.thumbnail((MAXSIZE, int(cont_img.height*1.0/cont_img.width*MAXSIZE)), Image.BICUBIC)
+         else:
+             cont_img.thumbnail((int(cont_img.width*1.0/cont_img.height*MAXSIZE), MAXSIZE), Image.BICUBIC)
+     print("Resize image: (%d,%d)->(%d,%d)" % (orig_width, orig_height, cont_img.width, cont_img.height))
+     return cont_img.width, cont_img.height
+
+
+ def superimpose(input_img, back_img):
+     # Matte out the foreground with rembg, then paste it onto the background
+     # using its own alpha channel as the mask.
+     matte_img = remove(input_img)
+     back_img.paste(matte_img, (0, 0), matte_img)
+     return back_img
+
+
+ def style_transfer(cont_img, styl_img):
+     with torch.no_grad():
+         new_cw, new_ch = memory_limit_image_resize(cont_img)
+         new_sw, new_sh = memory_limit_image_resize(styl_img)
+         cont_pilimg = cont_img.copy()
+         cw = cont_pilimg.width
+         ch = cont_pilimg.height
+         cont_img = transforms.ToTensor()(cont_img).unsqueeze(0)
+         styl_img = transforms.ToTensor()(styl_img).unsqueeze(0)
+
+         # No segmentation maps are used, so WCT is applied globally.
+         cont_seg = []
+         styl_seg = []
+
+         if device == 'cuda':
+             cont_img = cont_img.to(device)
+             styl_img = styl_img.to(device)
+             stylization_module.to(device)
+         cont_seg = np.asarray(cont_seg)
+         styl_seg = np.asarray(styl_seg)
+
+         stylized_img = stylization_module.transform(cont_img, styl_img, cont_seg, styl_seg)
+         if ch != new_ch or cw != new_cw:
+             stylized_img = nn.functional.interpolate(stylized_img, size=(ch, cw), mode='bilinear')
+         grid = utils.make_grid(stylized_img.data, nrow=1, padding=0)
+         ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).cpu().numpy()
+         stylized_img = Image.fromarray(ndarr)
+         # final_img = smooth_filter(stylized_img, cont_pilimg, f_radius=15, f_edge=1e-1)
+     return stylized_img
+
+
+ def smoother(stylized_img, over_img):
+     final_img = smoothing_module.process(stylized_img, over_img)
+     return final_img
+
+
+ if __name__ == "__main__":
+     root = pyrootutils.setup_root(__file__, pythonpath=True)
+     fg_path = root/"notebooks/profile_new.png"
+     bg_path = root/"notebooks/back_img.png"
+     ckpt_path = root/"src/models/MODNet/pretrained/modnet_photographic_portrait_matting.ckpt"
+
+     # stableDiffusionAPICall("Photorealistic scenery of a concert")
+     fg_img = Image.open(fg_path).resize((800, 800))
+     bg_img = Image.open(bg_path).resize((800, 800))
+     # img = combined_display(fg_img, bg_img, ckpt_path)
+     img = superimpose(fg_img, bg_img)
+     img.save(root/"notebooks/overlay.png")
+     # bg_img.paste(img, (0, 0), img)
+     # bg_img.save(root/"notebooks/check.png")
utils/smooth_filter.py ADDED
@@ -0,0 +1,405 @@
+ """
+ Copyright (C) 2018 NVIDIA Corporation. All rights reserved.
+ Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
+ """
+ src = '''
+ #include "/usr/local/cuda/include/math_functions.h"
+ #define TB 256
+ #define EPS 1e-7
+
+ // Invert a 4x4 matrix; returns false if the determinant is (near) zero.
+ __device__ bool InverseMat4x4(double m_in[4][4], double inv_out[4][4]) {
+     double m[16], inv[16];
+     for (int i = 0; i < 4; i++) {
+         for (int j = 0; j < 4; j++) {
+             m[i * 4 + j] = m_in[i][j];
+         }
+     }
+
+     inv[0] = m[5] * m[10] * m[15] -
+              m[5] * m[11] * m[14] -
+              m[9] * m[6] * m[15] +
+              m[9] * m[7] * m[14] +
+              m[13] * m[6] * m[11] -
+              m[13] * m[7] * m[10];
+
+     inv[4] = -m[4] * m[10] * m[15] +
+              m[4] * m[11] * m[14] +
+              m[8] * m[6] * m[15] -
+              m[8] * m[7] * m[14] -
+              m[12] * m[6] * m[11] +
+              m[12] * m[7] * m[10];
+
+     inv[8] = m[4] * m[9] * m[15] -
+              m[4] * m[11] * m[13] -
+              m[8] * m[5] * m[15] +
+              m[8] * m[7] * m[13] +
+              m[12] * m[5] * m[11] -
+              m[12] * m[7] * m[9];
+
+     inv[12] = -m[4] * m[9] * m[14] +
+               m[4] * m[10] * m[13] +
+               m[8] * m[5] * m[14] -
+               m[8] * m[6] * m[13] -
+               m[12] * m[5] * m[10] +
+               m[12] * m[6] * m[9];
+
+     inv[1] = -m[1] * m[10] * m[15] +
+              m[1] * m[11] * m[14] +
+              m[9] * m[2] * m[15] -
+              m[9] * m[3] * m[14] -
+              m[13] * m[2] * m[11] +
+              m[13] * m[3] * m[10];
+
+     inv[5] = m[0] * m[10] * m[15] -
+              m[0] * m[11] * m[14] -
+              m[8] * m[2] * m[15] +
+              m[8] * m[3] * m[14] +
+              m[12] * m[2] * m[11] -
+              m[12] * m[3] * m[10];
+
+     inv[9] = -m[0] * m[9] * m[15] +
+              m[0] * m[11] * m[13] +
+              m[8] * m[1] * m[15] -
+              m[8] * m[3] * m[13] -
+              m[12] * m[1] * m[11] +
+              m[12] * m[3] * m[9];
+
+     inv[13] = m[0] * m[9] * m[14] -
+               m[0] * m[10] * m[13] -
+               m[8] * m[1] * m[14] +
+               m[8] * m[2] * m[13] +
+               m[12] * m[1] * m[10] -
+               m[12] * m[2] * m[9];
+
+     inv[2] = m[1] * m[6] * m[15] -
+              m[1] * m[7] * m[14] -
+              m[5] * m[2] * m[15] +
+              m[5] * m[3] * m[14] +
+              m[13] * m[2] * m[7] -
+              m[13] * m[3] * m[6];
+
+     inv[6] = -m[0] * m[6] * m[15] +
+              m[0] * m[7] * m[14] +
+              m[4] * m[2] * m[15] -
+              m[4] * m[3] * m[14] -
+              m[12] * m[2] * m[7] +
+              m[12] * m[3] * m[6];
+
+     inv[10] = m[0] * m[5] * m[15] -
+               m[0] * m[7] * m[13] -
+               m[4] * m[1] * m[15] +
+               m[4] * m[3] * m[13] +
+               m[12] * m[1] * m[7] -
+               m[12] * m[3] * m[5];
+
+     inv[14] = -m[0] * m[5] * m[14] +
+               m[0] * m[6] * m[13] +
+               m[4] * m[1] * m[14] -
+               m[4] * m[2] * m[13] -
+               m[12] * m[1] * m[6] +
+               m[12] * m[2] * m[5];
+
+     inv[3] = -m[1] * m[6] * m[11] +
+              m[1] * m[7] * m[10] +
+              m[5] * m[2] * m[11] -
+              m[5] * m[3] * m[10] -
+              m[9] * m[2] * m[7] +
+              m[9] * m[3] * m[6];
+
+     inv[7] = m[0] * m[6] * m[11] -
+              m[0] * m[7] * m[10] -
+              m[4] * m[2] * m[11] +
+              m[4] * m[3] * m[10] +
+              m[8] * m[2] * m[7] -
+              m[8] * m[3] * m[6];
+
+     inv[11] = -m[0] * m[5] * m[11] +
+               m[0] * m[7] * m[9] +
+               m[4] * m[1] * m[11] -
+               m[4] * m[3] * m[9] -
+               m[8] * m[1] * m[7] +
+               m[8] * m[3] * m[5];
+
+     inv[15] = m[0] * m[5] * m[10] -
+               m[0] * m[6] * m[9] -
+               m[4] * m[1] * m[10] +
+               m[4] * m[2] * m[9] +
+               m[8] * m[1] * m[6] -
+               m[8] * m[2] * m[5];
+
+     double det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12];
+
+     if (abs(det) < 1e-9) {
+         return false;
+     }
+
+     det = 1.0 / det;
+
+     for (int i = 0; i < 4; i++) {
+         for (int j = 0; j < 4; j++) {
+             inv_out[i][j] = inv[i * 4 + j] * det;
+         }
+     }
+
+     return true;
+ }
+
+ extern "C"
+ __global__ void best_local_affine_kernel(
+     float *output, float *input, float *affine_model,
+     int h, int w, float epsilon, int kernel_radius
+ )
+ {
+     int size = h * w;
+     int id = blockIdx.x * blockDim.x + threadIdx.x;
+
+     if (id < size) {
+         int x = id % w, y = id / w;
+
+         double Mt_M[4][4] = {}; // 4x4
+         double invMt_M[4][4] = {};
+         double Mt_S[3][4] = {}; // RGB -> 1x4
+         double A[3][4] = {};
+         for (int i = 0; i < 4; i++)
+             for (int j = 0; j < 4; j++) {
+                 Mt_M[i][j] = 0, invMt_M[i][j] = 0;
+                 if (i != 3) {
+                     Mt_S[i][j] = 0, A[i][j] = 0;
+                     if (i == j)
+                         Mt_M[i][j] = 1e-3;
+                 }
+             }
+
+         for (int dy = -kernel_radius; dy <= kernel_radius; dy++) {
+             for (int dx = -kernel_radius; dx <= kernel_radius; dx++) {
+                 int xx = x + dx, yy = y + dy;
+                 int id2 = yy * w + xx;
+
+                 if (0 <= xx && xx < w && 0 <= yy && yy < h) {
+                     Mt_M[0][0] += input[id2 + 2*size] * input[id2 + 2*size];
+                     Mt_M[0][1] += input[id2 + 2*size] * input[id2 + size];
+                     Mt_M[0][2] += input[id2 + 2*size] * input[id2];
+                     Mt_M[0][3] += input[id2 + 2*size];
+
+                     Mt_M[1][0] += input[id2 + size] * input[id2 + 2*size];
+                     Mt_M[1][1] += input[id2 + size] * input[id2 + size];
+                     Mt_M[1][2] += input[id2 + size] * input[id2];
+                     Mt_M[1][3] += input[id2 + size];
+
+                     Mt_M[2][0] += input[id2] * input[id2 + 2*size];
+                     Mt_M[2][1] += input[id2] * input[id2 + size];
+                     Mt_M[2][2] += input[id2] * input[id2];
+                     Mt_M[2][3] += input[id2];
+
+                     Mt_M[3][0] += input[id2 + 2*size];
+                     Mt_M[3][1] += input[id2 + size];
+                     Mt_M[3][2] += input[id2];
+                     Mt_M[3][3] += 1;
+
+                     Mt_S[0][0] += input[id2 + 2*size] * output[id2 + 2*size];
+                     Mt_S[0][1] += input[id2 + size] * output[id2 + 2*size];
+                     Mt_S[0][2] += input[id2] * output[id2 + 2*size];
+                     Mt_S[0][3] += output[id2 + 2*size];
+
+                     Mt_S[1][0] += input[id2 + 2*size] * output[id2 + size];
+                     Mt_S[1][1] += input[id2 + size] * output[id2 + size];
+                     Mt_S[1][2] += input[id2] * output[id2 + size];
+                     Mt_S[1][3] += output[id2 + size];
+
+                     Mt_S[2][0] += input[id2 + 2*size] * output[id2];
+                     Mt_S[2][1] += input[id2 + size] * output[id2];
+                     Mt_S[2][2] += input[id2] * output[id2];
+                     Mt_S[2][3] += output[id2];
+                 }
+             }
+         }
+
+         bool success = InverseMat4x4(Mt_M, invMt_M);
+
+         for (int i = 0; i < 3; i++) {
+             for (int j = 0; j < 4; j++) {
+                 for (int k = 0; k < 4; k++) {
+                     A[i][j] += invMt_M[j][k] * Mt_S[i][k];
+                 }
+             }
+         }
+
+         for (int i = 0; i < 3; i++) {
+             for (int j = 0; j < 4; j++) {
+                 int affine_id = i * 4 + j;
+                 affine_model[12 * id + affine_id] = A[i][j];
+             }
+         }
+     }
+     return ;
+ }
+
+ extern "C"
+ __global__ void bilateral_smooth_kernel(
+     float *affine_model, float *filtered_affine_model, float *guide,
+     int h, int w, int kernel_radius, float sigma1, float sigma2
+ )
+ {
+     int id = blockIdx.x * blockDim.x + threadIdx.x;
+     int size = h * w;
+     if (id < size) {
+         int x = id % w;
+         int y = id / w;
+
+         double sum_affine[12] = {};
+         double sum_weight = 0;
+         for (int dx = -kernel_radius; dx <= kernel_radius; dx++) {
+             for (int dy = -kernel_radius; dy <= kernel_radius; dy++) {
+                 int yy = y + dy, xx = x + dx;
+                 int id2 = yy * w + xx;
+                 if (0 <= xx && xx < w && 0 <= yy && yy < h) {
+                     float color_diff1 = guide[yy*w + xx] - guide[y*w + x];
+                     float color_diff2 = guide[yy*w + xx + size] - guide[y*w + x + size];
+                     float color_diff3 = guide[yy*w + xx + 2*size] - guide[y*w + x + 2*size];
+                     float color_diff_sqr =
+                         (color_diff1*color_diff1 + color_diff2*color_diff2 + color_diff3*color_diff3) / 3;
+
+                     float v1 = exp(-(dx * dx + dy * dy) / (2 * sigma1 * sigma1));
+                     float v2 = exp(-(color_diff_sqr) / (2 * sigma2 * sigma2));
+                     float weight = v1 * v2;
+
+                     for (int i = 0; i < 3; i++) {
+                         for (int j = 0; j < 4; j++) {
+                             int affine_id = i * 4 + j;
+                             sum_affine[affine_id] += weight * affine_model[id2*12 + affine_id];
+                         }
+                     }
+                     sum_weight += weight;
+                 }
+             }
+         }
+
+         for (int i = 0; i < 3; i++) {
+             for (int j = 0; j < 4; j++) {
+                 int affine_id = i * 4 + j;
+                 filtered_affine_model[id*12 + affine_id] = sum_affine[affine_id] / sum_weight;
+             }
+         }
+     }
+     return ;
+ }
+
+ extern "C"
+ __global__ void reconstruction_best_kernel(
+     float *input, float *filtered_affine_model, float *filtered_best_output,
+     int h, int w
+ )
+ {
+     int id = blockIdx.x * blockDim.x + threadIdx.x;
+     int size = h * w;
+     if (id < size) {
+         double out1 =
+             input[id + 2*size] * filtered_affine_model[id*12 + 0] + // A[0][0] +
+             input[id + size] * filtered_affine_model[id*12 + 1] +   // A[0][1] +
+             input[id] * filtered_affine_model[id*12 + 2] +          // A[0][2] +
+             filtered_affine_model[id*12 + 3];                       // A[0][3];
+         double out2 =
+             input[id + 2*size] * filtered_affine_model[id*12 + 4] + // A[1][0] +
+             input[id + size] * filtered_affine_model[id*12 + 5] +   // A[1][1] +
+             input[id] * filtered_affine_model[id*12 + 6] +          // A[1][2] +
+             filtered_affine_model[id*12 + 7];                       // A[1][3];
+         double out3 =
+             input[id + 2*size] * filtered_affine_model[id*12 + 8] + // A[2][0] +
+             input[id + size] * filtered_affine_model[id*12 + 9] +   // A[2][1] +
+             input[id] * filtered_affine_model[id*12 + 10] +         // A[2][2] +
+             filtered_affine_model[id*12 + 11];                      // A[2][3];
+
+         filtered_best_output[id] = out1;
+         filtered_best_output[id + size] = out2;
+         filtered_best_output[id + 2*size] = out3;
+     }
+     return ;
+ }
+ '''
+
+ import torch
+ import numpy as np
+ from PIL import Image
+ from cupy.cuda import function
+ from pynvrtc.compiler import Program
+ from collections import namedtuple
+
+
+ def smooth_local_affine(output_cpu, input_cpu, epsilon, patch, h, w, f_r, f_e):
+     # Compile the CUDA source above with NVRTC and load it through CuPy.
+     program = Program(src, 'best_local_affine_kernel.cu')
+     ptx = program.compile(['-I/usr/local/cuda/include'])
+     m = function.Module()
+     m.load(bytes(ptx.encode()))
+
+     _reconstruction_best_kernel = m.get_function('reconstruction_best_kernel')
+     _bilateral_smooth_kernel = m.get_function('bilateral_smooth_kernel')
+     _best_local_affine_kernel = m.get_function('best_local_affine_kernel')
+     Stream = namedtuple('Stream', ['ptr'])
+     s = Stream(ptr=torch.cuda.current_stream().cuda_stream)
+
+     filter_radius = f_r
+     sigma1 = filter_radius / 3
+     sigma2 = f_e
+     radius = (patch - 1) / 2
+
+     filtered_best_output = torch.zeros(np.shape(input_cpu)).cuda()
+     affine_model = torch.zeros((h * w, 12)).cuda()
+     filtered_affine_model = torch.zeros((h * w, 12)).cuda()
+
+     input_ = torch.from_numpy(input_cpu).cuda()
+     output_ = torch.from_numpy(output_cpu).cuda()
+     # 1) Fit a per-pixel local affine model from the guide to the stylized image.
+     _best_local_affine_kernel(
+         grid=(int((h * w) / 256 + 1), 1),
+         block=(256, 1, 1),
+         args=[output_.data_ptr(), input_.data_ptr(), affine_model.data_ptr(),
+               np.int32(h), np.int32(w), np.float32(epsilon), np.int32(radius)], stream=s
+     )
+
+     # 2) Smooth the per-pixel affine models with a joint bilateral filter.
+     _bilateral_smooth_kernel(
+         grid=(int((h * w) / 256 + 1), 1),
+         block=(256, 1, 1),
+         args=[affine_model.data_ptr(), filtered_affine_model.data_ptr(), input_.data_ptr(),
+               np.int32(h), np.int32(w), np.int32(f_r), np.float32(sigma1), np.float32(sigma2)], stream=s
+     )
+
+     # 3) Re-apply the filtered models to reconstruct the output image.
+     _reconstruction_best_kernel(
+         grid=(int((h * w) / 256 + 1), 1),
+         block=(256, 1, 1),
+         args=[input_.data_ptr(), filtered_affine_model.data_ptr(), filtered_best_output.data_ptr(),
+               np.int32(h), np.int32(w)], stream=s
+     )
+     numpy_filtered_best_output = filtered_best_output.cpu().numpy()
+     return numpy_filtered_best_output
+
+
+ def smooth_filter(initImg, contentImg, f_radius=15, f_edge=1e-1):
+     '''
+     :param initImg: intermediate output. Either image path or PIL Image
+     :param contentImg: content image output. Either path or PIL Image
+     :return: stylized output image. PIL Image
+     '''
+     if type(initImg) == str:
+         initImg = Image.open(initImg).convert("RGB")
+     best_image_bgr = np.array(initImg, dtype=np.float32)
+     # np arrays are (height, width, channels); the bW/bH names are swapped,
+     # but the resize((bH, bW)) call below is consistent with PIL's (width, height).
+     bW, bH, bC = best_image_bgr.shape
+     best_image_bgr = best_image_bgr[:, :, ::-1]
+     best_image_bgr = best_image_bgr.transpose((2, 0, 1))
+
+     if type(contentImg) == str:
+         contentImg = Image.open(contentImg).convert("RGB")
+     content_input = contentImg.resize((bH, bW))
+     content_input = np.array(content_input, dtype=np.float32)
+     content_input = content_input[:, :, ::-1]
+     content_input = content_input.transpose((2, 0, 1))
+     input_ = np.ascontiguousarray(content_input, dtype=np.float32) / 255.
+     _, H, W = np.shape(input_)
+     output_ = np.ascontiguousarray(best_image_bgr, dtype=np.float32) / 255.
+     best_ = smooth_local_affine(output_, input_, 1e-7, 3, H, W, f_radius, f_edge)
+     best_ = best_.transpose(1, 2, 0)
+     result = Image.fromarray(np.uint8(np.clip(best_ * 255., 0, 255.)))
+     return result
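Per pixel, `best_local_affine_kernel` fits a regularized least-squares affine map from the guide image to the stylized output over a local window; summarizing the accumulators above ($M$ holds the window's guide pixels in homogeneous RGB coordinates, $S$ the corresponding stylized pixels, and the $10^{-3}$ diagonal initialization acts as a ridge term $\lambda I$):

$$
A^\top = (M^\top M + \lambda I)^{-1} M^\top S,
$$

after which `bilateral_smooth_kernel` averages the per-pixel $3 \times 4$ models with joint spatial/color weights and `reconstruction_best_kernel` applies the filtered models back to the guide pixels.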