Commit a856109 · Ray-1026 committed · 1 Parent(s): ef36a49

update
Browse files
- .gitattributes +5 -0
- SIFR_models/flare7kpp/__pycache__/model.cpython-39.pyc +0 -0
- SIFR_models/flare7kpp/model.py +0 -0
- SIFR_models/mfdnet/backbone.py +285 -0
- SIFR_models/mfdnet/blocks.py +164 -0
- SIFR_models/mfdnet/model.py +786 -0
- app.py +195 -45
- requirements.txt +15 -2
- src/models/__pycache__/light_source_regressor.cpython-39.pyc +0 -0
- src/models/__pycache__/unet.cpython-39.pyc +0 -0
- src/models/light_source_regressor.py +124 -0
- src/models/unet.py +129 -0
- src/pipelines/__pycache__/pipeline_controlnet_outpaint.cpython-39.pyc +0 -0
- src/pipelines/__pycache__/pipeline_stable_diffusion_outpaint.cpython-39.pyc +0 -0
- src/pipelines/pipeline_controlnet_outpaint.py +448 -0
- src/pipelines/pipeline_stable_diffusion_outpaint.py +517 -0
- src/schedulers/__pycache__/scheduling_pndm.cpython-39.pyc +0 -0
- src/schedulers/scheduling_pndm.py +126 -0
- utils/__pycache__/dataset.cpython-39.pyc +0 -0
- utils/__pycache__/utils.cpython-39.pyc +0 -0
- utils/dataset.py +1304 -0
- utils/loss.py +80 -0
- utils/utils.py +311 -0
- weights/light_outpaint_lora/pytorch_lora_weights.safetensors +3 -0
- weights/light_regress/model.pth +3 -0
- weights/net_g_last.pth +3 -0
.gitattributes
CHANGED
@@ -35,3 +35,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/* filter=lfs diff=lfs merge=lfs -text
 assets/exp.png filter=lfs diff=lfs merge=lfs -text
+weights/light_outpaint_lora filter=lfs diff=lfs merge=lfs -text
+weights/light_regress filter=lfs diff=lfs merge=lfs -text
+weights/net_g_last.pth filter=lfs diff=lfs merge=lfs -text
+weights/light_outpaint_lora/pytorch_lora_weights.safetensors filter=lfs diff=lfs merge=lfs -text
+weights/light_regress/model.pth filter=lfs diff=lfs merge=lfs -text
SIFR_models/flare7kpp/__pycache__/model.cpython-39.pyc
ADDED
Binary file (52.1 kB).
SIFR_models/flare7kpp/model.py
ADDED
The diff for this file is too large to render.
SIFR_models/mfdnet/backbone.py
ADDED
@@ -0,0 +1,285 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class LayerNormFunction(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x, weight, bias, eps):
        ctx.eps = eps
        N, C, H, W = x.size()
        mu = x.mean(1, keepdim=True)
        var = (x - mu).pow(2).mean(1, keepdim=True)
        y = (x - mu) / (var + eps).sqrt()
        ctx.save_for_backward(y, var, weight)
        y = weight.view(1, C, 1, 1) * y + bias.view(1, C, 1, 1)
        return y

    @staticmethod
    def backward(ctx, grad_output):
        eps = ctx.eps

        N, C, H, W = grad_output.size()
        y, var, weight = ctx.saved_variables
        g = grad_output * weight.view(1, C, 1, 1)
        mean_g = g.mean(dim=1, keepdim=True)

        mean_gy = (g * y).mean(dim=1, keepdim=True)
        gx = 1.0 / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g)
        return (
            gx,
            (grad_output * y).sum(dim=3).sum(dim=2).sum(dim=0),
            grad_output.sum(dim=3).sum(dim=2).sum(dim=0),
            None,
        )


class LayerNorm2d(nn.Module):

    def __init__(self, channels, eps=1e-6):
        super(LayerNorm2d, self).__init__()
        self.register_parameter("weight", nn.Parameter(torch.ones(channels)))
        self.register_parameter("bias", nn.Parameter(torch.zeros(channels)))
        self.eps = eps

    def forward(self, x):
        return LayerNormFunction.apply(x, self.weight, self.bias, self.eps)


class SimpleGate(nn.Module):
    def forward(self, x):
        x1, x2 = x.chunk(2, dim=1)
        return x1 * x2


class NAFBlock(nn.Module):
    def __init__(self, c, DW_Expand=2, FFN_Expand=2, drop_out_rate=0.0):
        super().__init__()
        dw_channel = c * DW_Expand
        self.conv1 = nn.Conv2d(
            in_channels=c,
            out_channels=dw_channel,
            kernel_size=1,
            padding=0,
            stride=1,
            groups=1,
            bias=True,
        )
        self.conv2 = nn.Conv2d(
            in_channels=dw_channel,
            out_channels=dw_channel,
            kernel_size=3,
            padding=1,
            stride=1,
            groups=dw_channel,
            bias=True,
        )
        self.conv3 = nn.Conv2d(
            in_channels=dw_channel // 2,
            out_channels=c,
            kernel_size=1,
            padding=0,
            stride=1,
            groups=1,
            bias=True,
        )

        # Simplified Channel Attention
        self.sca = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(
                in_channels=dw_channel // 2,
                out_channels=dw_channel // 2,
                kernel_size=1,
                padding=0,
                stride=1,
                groups=1,
                bias=True,
            ),
        )

        # SimpleGate
        self.sg = SimpleGate()

        ffn_channel = FFN_Expand * c
        self.conv4 = nn.Conv2d(
            in_channels=c,
            out_channels=ffn_channel,
            kernel_size=1,
            padding=0,
            stride=1,
            groups=1,
            bias=True,
        )
        self.conv5 = nn.Conv2d(
            in_channels=ffn_channel // 2,
            out_channels=c,
            kernel_size=1,
            padding=0,
            stride=1,
            groups=1,
            bias=True,
        )

        self.norm1 = LayerNorm2d(c)
        self.norm2 = LayerNorm2d(c)

        self.dropout1 = (
            nn.Dropout(drop_out_rate) if drop_out_rate > 0.0 else nn.Identity()
        )
        self.dropout2 = (
            nn.Dropout(drop_out_rate) if drop_out_rate > 0.0 else nn.Identity()
        )

        self.beta = nn.Parameter(torch.zeros((1, c, 1, 1)), requires_grad=True)
        self.gamma = nn.Parameter(torch.zeros((1, c, 1, 1)), requires_grad=True)

    def forward(self, inp):
        x = inp

        x = self.norm1(x)

        x = self.conv1(x)
        x = self.conv2(x)
        x = self.sg(x)
        x = x * self.sca(x)
        x = self.conv3(x)

        x = self.dropout1(x)

        y = inp + x * self.beta

        x = self.conv4(self.norm2(y))
        x = self.sg(x)
        x = self.conv5(x)

        x = self.dropout2(x)

        return y + x * self.gamma


class NAFNet(nn.Module):

    def __init__(
        self,
        img_channel=3,
        width=32,
        middle_blk_num=12,
        enc_blk_nums=[2, 2, 4, 8],
        dec_blk_nums=[2, 2, 2, 2],
    ):
        super().__init__()

        self.intro = nn.Conv2d(
            in_channels=img_channel,
            out_channels=width,
            kernel_size=3,
            padding=1,
            stride=1,
            groups=1,
            bias=True,
        )
        self.ending = nn.Conv2d(
            in_channels=width,
            out_channels=img_channel,
            kernel_size=3,
            padding=1,
            stride=1,
            groups=1,
            bias=True,
        )

        self.encoders = nn.ModuleList()
        self.decoders = nn.ModuleList()
        self.middle_blks = nn.ModuleList()
        self.ups = nn.ModuleList()
        self.downs = nn.ModuleList()

        chan = width
        for num in enc_blk_nums:
            self.encoders.append(nn.Sequential(*[NAFBlock(chan) for _ in range(num)]))
            self.downs.append(nn.Conv2d(chan, 2 * chan, 2, 2))
            chan = chan * 2

        self.middle_blks = nn.Sequential(
            *[NAFBlock(chan) for _ in range(middle_blk_num)]
        )

        for num in dec_blk_nums:
            self.ups.append(
                nn.Sequential(
                    nn.Conv2d(chan, chan * 2, 1, bias=False), nn.PixelShuffle(2)
                )
            )
            chan = chan // 2
            self.decoders.append(nn.Sequential(*[NAFBlock(chan) for _ in range(num)]))

        self.padder_size = 2 ** len(self.encoders)

    def forward(self, inp):
        B, C, H, W = inp.shape
        inp = self.check_image_size(inp)

        x = self.intro(inp)

        encs = []

        for encoder, down in zip(self.encoders, self.downs):
            x = encoder(x)
            encs.append(x)
            x = down(x)

        x = self.middle_blks(x)

        for decoder, up, enc_skip in zip(self.decoders, self.ups, encs[::-1]):
            x = up(x)
            x = x + enc_skip
            x = decoder(x)

        x = self.ending(x)
        x = x + inp

        return x[:, :, :H, :W]

    def check_image_size(self, x):
        _, _, h, w = x.size()
        mod_pad_h = (self.padder_size - h % self.padder_size) % self.padder_size
        mod_pad_w = (self.padder_size - w % self.padder_size) % self.padder_size
        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h))
        return x


if __name__ == "__main__":
    img_channel = 3
    width = 32

    enc_blks = [2, 2, 4, 8]
    middle_blk_num = 12
    dec_blks = [2, 2, 2, 2]

    print(
        "enc blks",
        enc_blks,
        "middle blk num",
        middle_blk_num,
        "dec blks",
        dec_blks,
        "width",
        width,
    )

    # using('start . ')
    model = NAFNet(
        img_channel=img_channel,
        width=width,
        middle_blk_num=middle_blk_num,
        enc_blk_nums=enc_blks,
        dec_blk_nums=dec_blks,
    ).cuda()

    model.eval()
    input = torch.randn(1, 3, 15, 22).cuda()
    # input = torch.randn(1, 3, 32, 32)
    y = model(input)
    print(y.size())
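For quick verification, a minimal CPU sketch of the NAFNet defined above (the in-file __main__ block assumes a CUDA device); it assumes the repository root is on PYTHONPATH so that SIFR_models.mfdnet is importable.

import torch

from SIFR_models.mfdnet.backbone import NAFNet

# Default configuration from the file above; eval mode, CPU, no gradients.
model = NAFNet(
    img_channel=3,
    width=32,
    middle_blk_num=12,
    enc_blk_nums=[2, 2, 4, 8],
    dec_blk_nums=[2, 2, 2, 2],
).eval()

with torch.no_grad():
    x = torch.randn(1, 3, 15, 22)  # odd sizes are padded internally to a multiple of 2**4
    y = model(x)

print(y.shape)  # torch.Size([1, 3, 15, 22]) -- the output is cropped back to the input size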
SIFR_models/mfdnet/blocks.py
ADDED
@@ -0,0 +1,164 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvLayer(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        dilation=1,
        bias=True,
        groups=1,
        norm="in",
        nonlinear="relu",
    ):
        super(ConvLayer, self).__init__()
        reflection_padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
        self.reflection_pad = nn.ReflectionPad2d(reflection_padding)
        self.conv2d = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            groups=groups,
            bias=bias,
            dilation=dilation,
        )
        self.norm = norm
        self.nonlinear = nonlinear

        if norm == "bn":
            self.normalization = nn.BatchNorm2d(out_channels)
        elif norm == "in":
            self.normalization = nn.InstanceNorm2d(out_channels, affine=False)
        else:
            self.normalization = None

        if nonlinear == "relu":
            self.activation = nn.ReLU(inplace=True)
        elif nonlinear == "leakyrelu":
            self.activation = nn.LeakyReLU(0.2)
        elif nonlinear == "PReLU":
            self.activation = nn.PReLU()
        else:
            self.activation = None

    def forward(self, x):
        out = self.conv2d(self.reflection_pad(x))
        if self.normalization is not None:
            out = self.normalization(out)
        if self.activation is not None:
            out = self.activation(out)

        return out


class Aggreation(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(Aggreation, self).__init__()
        self.attention = SelfAttention(in_channels, k=8, nonlinear="relu")
        self.conv = ConvLayer(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=1,
            dilation=1,
            nonlinear="leakyrelu",
            norm=None,
        )

    def forward(self, x):
        return self.conv(self.attention(x))


class SelfAttention(nn.Module):
    def __init__(self, channels, k, nonlinear="relu"):
        super(SelfAttention, self).__init__()
        self.channels = channels
        self.k = k
        self.nonlinear = nonlinear

        self.linear1 = nn.Linear(channels, channels // k)
        self.linear2 = nn.Linear(channels // k, channels)
        self.global_pooling = nn.AdaptiveAvgPool2d((1, 1))

        if nonlinear == "relu":
            self.activation = nn.ReLU(inplace=True)
        elif nonlinear == "leakyrelu":
            self.activation = nn.LeakyReLU(0.2)
        elif nonlinear == "PReLU":
            self.activation = nn.PReLU()
        else:
            raise ValueError

    def attention(self, x):
        N, C, H, W = x.size()
        out = torch.flatten(self.global_pooling(x), 1)
        out = self.activation(self.linear1(out))
        out = torch.sigmoid(self.linear2(out)).view(N, C, 1, 1)

        return out.mul(x)

    def forward(self, x):
        return self.attention(x)


class SPP(nn.Module):
    def __init__(
        self, in_channels, out_channels, num_layers=4, interpolation_type="bilinear"
    ):
        super(SPP, self).__init__()
        self.conv = nn.ModuleList()
        self.num_layers = num_layers
        self.interpolation_type = interpolation_type

        for _ in range(self.num_layers):
            self.conv.append(
                ConvLayer(
                    in_channels,
                    in_channels,
                    kernel_size=1,
                    stride=1,
                    dilation=1,
                    nonlinear="leakyrelu",
                    norm=None,
                )
            )

        self.fusion = ConvLayer(
            (in_channels * (self.num_layers + 1)),
            out_channels,
            kernel_size=3,
            stride=1,
            norm="False",
            nonlinear="leakyrelu",
        )

    def forward(self, x):

        N, C, H, W = x.size()
        out = []

        for level in range(self.num_layers):
            out.append(
                F.interpolate(
                    self.conv[level](
                        F.avg_pool2d(
                            x,
                            kernel_size=2 * 2 ** (level + 1),
                            stride=2 * 2 ** (level + 1),
                            padding=2 * 2 ** (level + 1) % 2,
                        )
                    ),
                    size=(H, W),
                    mode=self.interpolation_type,
                )
            )

        out.append(x)

        return self.fusion(torch.cat(out, dim=1))
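A small sketch of how the blocks above compose, assuming the module path added in this commit and the repository root on PYTHONPATH. SPP pools the input at four scales and fuses everything back at the original resolution, so spatial size is preserved; Aggreation is channel self-attention followed by a 3x3 ConvLayer (the 9-to-3 channel setting matches how TransHigh in model.py uses it).

import torch

from SIFR_models.mfdnet.blocks import SPP, Aggreation

x = torch.randn(1, 3, 128, 128)

# Four-level spatial pyramid pooling, fused back to 3 channels at 128x128.
spp = SPP(in_channels=3, out_channels=3, num_layers=4, interpolation_type="bicubic")
print(spp(x).shape)  # torch.Size([1, 3, 128, 128])

# Channel self-attention + 3x3 ConvLayer, 9 -> 3 channels.
agg = Aggreation(in_channels=9, out_channels=3)
print(agg(torch.randn(1, 9, 128, 128)).shape)  # torch.Size([1, 3, 128, 128])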
SIFR_models/mfdnet/model.py
ADDED
@@ -0,0 +1,786 @@
import numbers

import einops
from einops import rearrange

from .backbone import *
from .blocks import *


class ResidualBlock(nn.Module):
    def __init__(self, in_features):
        super(ResidualBlock, self).__init__()

        self.block = nn.Sequential(
            nn.Conv2d(in_features, in_features, 3, padding=1),
            nn.LeakyReLU(),
            nn.Conv2d(in_features, in_features, 3, padding=1),
        )

    def forward(self, x):
        return x + self.block(x)


def gauss_kernel(channels=3):
    kernel = torch.tensor(
        [
            [1.0, 4.0, 6.0, 4.0, 1],
            [4.0, 16.0, 24.0, 16.0, 4.0],
            [6.0, 24.0, 36.0, 24.0, 6.0],
            [4.0, 16.0, 24.0, 16.0, 4.0],
            [1.0, 4.0, 6.0, 4.0, 1.0],
        ]
    )
    kernel /= 256.0
    kernel = kernel.repeat(channels, 1, 1, 1)
    return kernel


class LapPyramidConv(nn.Module):
    def __init__(self, num_high=4):
        super(LapPyramidConv, self).__init__()

        self.num_high = num_high
        self.kernel = gauss_kernel()

    def downsample(self, x):
        return x[:, :, ::2, ::2]

    def upsample(self, x):
        cc = torch.cat(
            [
                x,
                torch.zeros(
                    x.shape[0], x.shape[1], x.shape[2], x.shape[3], device=x.device
                ),
            ],
            dim=3,
        )
        cc = cc.view(x.shape[0], x.shape[1], x.shape[2] * 2, x.shape[3])
        cc = cc.permute(0, 1, 3, 2)
        cc = torch.cat(
            [
                cc,
                torch.zeros(
                    x.shape[0], x.shape[1], x.shape[3], x.shape[2] * 2, device=x.device
                ),
            ],
            dim=3,
        )
        cc = cc.view(x.shape[0], x.shape[1], x.shape[3] * 2, x.shape[2] * 2)
        x_up = cc.permute(0, 1, 3, 2)
        return self.conv_gauss(x_up, 4 * self.kernel)

    def conv_gauss(self, img, kernel):
        # pad the last two dimensions (left, right, top, bottom)
        img = torch.nn.functional.pad(img, (2, 2, 2, 2), mode="reflect")
        # grouped (per-channel) convolution
        out = torch.nn.functional.conv2d(
            img, kernel.to(img.device), groups=img.shape[1]
        )
        return out

    def pyramid_decom(self, img):
        current = img
        pyr = []
        for _ in range(self.num_high):
            filtered = self.conv_gauss(current, self.kernel)
            down = self.downsample(filtered)
            up = self.upsample(down)
            if up.shape[2] != current.shape[2] or up.shape[3] != current.shape[3]:
                up = nn.functional.interpolate(
                    up, size=(current.shape[2], current.shape[3])
                )
            diff = current - up
            pyr.append(diff)
            current = down
        pyr.append(current)
        return pyr

    def pyramid_recons(self, pyr):
        image = pyr[-1]
        for level in reversed(pyr[:-1]):
            up = self.upsample(image)
            if up.shape[2] != level.shape[2] or up.shape[3] != level.shape[3]:
                up = nn.functional.interpolate(
                    up, size=(level.shape[2], level.shape[3])
                )
            image = up + level
        return image


class TransHigh(nn.Module):
    def __init__(self, num_residual_blocks, num_high=3):
        super(TransHigh, self).__init__()

        self.num_high = num_high

        blocks = [nn.Conv2d(9, 64, 3, padding=1), nn.LeakyReLU()]

        for _ in range(num_residual_blocks):
            blocks += [ResidualBlock(64)]

        blocks += [nn.Conv2d(64, 3, 3, padding=1)]

        self.model = nn.Sequential(*blocks)

        channels = 3
        # Stage1
        self.block1_1 = ConvLayer(
            in_channels=channels,
            out_channels=channels,
            kernel_size=3,
            stride=1,
            dilation=2,
            norm=None,
            nonlinear="leakyrelu",
        )
        self.block1_2 = ConvLayer(
            in_channels=channels,
            out_channels=channels,
            kernel_size=3,
            stride=1,
            dilation=4,
            norm=None,
            nonlinear="leakyrelu",
        )
        self.aggreation1_rgb = Aggreation(
            in_channels=channels * 3, out_channels=channels
        )
        # Stage2
        self.block2_1 = ConvLayer(
            in_channels=channels,
            out_channels=channels,
            kernel_size=3,
            stride=1,
            dilation=8,
            norm=None,
            nonlinear="leakyrelu",
        )
        self.block2_2 = ConvLayer(
            in_channels=channels,
            out_channels=channels,
            kernel_size=3,
            stride=1,
            dilation=16,
            norm=None,
            nonlinear="leakyrelu",
        )
        self.aggreation2_rgb = Aggreation(
            in_channels=channels * 3, out_channels=channels
        )
        # Stage3
        self.block3_1 = ConvLayer(
            in_channels=channels,
            out_channels=channels,
            kernel_size=3,
            stride=1,
            dilation=32,
            norm=None,
            nonlinear="leakyrelu",
        )
        self.block3_2 = ConvLayer(
            in_channels=channels,
            out_channels=channels,
            kernel_size=3,
            stride=1,
            dilation=64,
            norm=None,
            nonlinear="leakyrelu",
        )
        self.aggreation3_rgb = Aggreation(
            in_channels=channels * 3, out_channels=channels
        )
        # self.block_3 = NAFNet(middle_blk_num=2, enc_blk_nums=[
        #     1,1], dec_blk_nums=[1,1])
        self.trans_mask_block_1 = nn.Sequential(
            nn.Conv2d(3, 16, 1), nn.LeakyReLU(), nn.Conv2d(16, 3, 1)
        )
        self.trans_mask_block_2 = nn.Sequential(
            nn.Conv2d(3, 16, 1), nn.LeakyReLU(), nn.Conv2d(16, 3, 1)
        )

        # self.trans_mask_block = NAFNet(
        #     middle_blk_num=1, enc_blk_nums=[1], dec_blk_nums=[1])
        # Stage3
        self.spp_img = SPP(
            in_channels=channels,
            out_channels=channels,
            num_layers=4,
            interpolation_type="bicubic",
        )
        self.block4_1 = nn.Conv2d(
            in_channels=channels, out_channels=3, kernel_size=1, stride=1
        )

    def forward(self, x, pyr_original, fake_low):
        pyr_result = [fake_low]
        mask = self.model(x)

        mask = nn.functional.interpolate(
            mask, size=(pyr_original[-2].shape[2], pyr_original[-2].shape[3])
        )
        mask = self.trans_mask_block_1(mask)
        result_highfreq = torch.mul(pyr_original[-2], mask) + pyr_original[-2]

        # result_highfreq = self.block_3(result_highfreq)
        out1_1 = self.block1_1(result_highfreq)
        out1_2 = self.block1_2(out1_1)
        agg1_rgb = self.aggreation1_rgb(
            torch.cat((result_highfreq, out1_1, out1_2), dim=1)
        )
        pyr_result.append(agg1_rgb)

        mask = nn.functional.interpolate(
            mask, size=(pyr_original[-3].shape[2], pyr_original[-3].shape[3])
        )
        mask = self.trans_mask_block_2(mask)
        result_highfreq = torch.mul(pyr_original[-3], mask) + pyr_original[-3]

        # result_highfreq = self.block_3(result_highfreq)
        out2_1 = self.block2_1(result_highfreq)
        out2_2 = self.block2_2(out2_1)
        agg2_rgb = self.aggreation2_rgb(
            torch.cat((result_highfreq, out2_1, out2_2), dim=1)
        )

        out3_1 = self.block3_1(agg2_rgb)
        out3_2 = self.block3_2(out3_1)
        agg3_rgb = self.aggreation3_rgb(torch.cat((agg2_rgb, out3_1, out3_2), dim=1))

        spp_rgb = self.spp_img(agg3_rgb)
        out_rgb = self.block4_1(spp_rgb)

        pyr_result.append(out_rgb)
        pyr_result.reverse()

        return pyr_result


# Layer Norm


def to_3d(x):
    return rearrange(x, "b c h w -> b (h w) c")


def to_4d(x, h, w):
    return rearrange(x, "b (h w) c -> b c h w", h=h, w=w)


class BiasFree_LayerNorm(nn.Module):
    def __init__(self, normalized_shape):
        super(BiasFree_LayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        normalized_shape = torch.Size(normalized_shape)

        assert len(normalized_shape) == 1

        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.normalized_shape = normalized_shape

    def forward(self, x):
        sigma = x.var(-1, keepdim=True, unbiased=False)
        return x / torch.sqrt(sigma + 1e-5) * self.weight


class WithBias_LayerNorm(nn.Module):
    def __init__(self, normalized_shape):
        super(WithBias_LayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        normalized_shape = torch.Size(normalized_shape)

        assert len(normalized_shape) == 1

        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.normalized_shape = normalized_shape

    def forward(self, x):
        mu = x.mean(-1, keepdim=True)
        sigma = x.var(-1, keepdim=True, unbiased=False)
        return (x - mu) / torch.sqrt(sigma + 1e-5) * self.weight + self.bias


class LayerNorm(nn.Module):
    def __init__(self, dim, LayerNorm_type):
        super(LayerNorm, self).__init__()
        if LayerNorm_type == "BiasFree":
            self.body = BiasFree_LayerNorm(dim)
        else:
            self.body = WithBias_LayerNorm(dim)

    def forward(self, x):
        h, w = x.shape[-2:]
        return to_4d(self.body(to_3d(x)), h, w)


# Axis-based Multi-head Self-Attention


class NextAttentionImplZ(nn.Module):
    def __init__(self, num_dims, num_heads, bias) -> None:
        super().__init__()
        self.num_dims = num_dims
        self.num_heads = num_heads
        self.q1 = nn.Conv2d(num_dims, num_dims * 3, kernel_size=1, bias=bias)
        self.q2 = nn.Conv2d(
            num_dims * 3,
            num_dims * 3,
            kernel_size=3,
            padding=1,
            groups=num_dims * 3,
            bias=bias,
        )
        self.q3 = nn.Conv2d(
            num_dims * 3,
            num_dims * 3,
            kernel_size=3,
            padding=1,
            groups=num_dims * 3,
            bias=bias,
        )

        self.fac = nn.Parameter(torch.ones(1))
        self.fin = nn.Conv2d(num_dims, num_dims, kernel_size=1, bias=bias)
        return

    def forward(self, x):
        # x: [n, c, h, w]
        n, c, h, w = x.size()
        n_heads, dim_head = self.num_heads, c // self.num_heads

        def reshape(x):
            return einops.rearrange(
                x, "n (nh dh) h w -> (n nh h) w dh", nh=n_heads, dh=dim_head
            )

        qkv = self.q3(self.q2(self.q1(x)))
        q, k, v = map(reshape, qkv.chunk(3, dim=1))
        q = F.normalize(q, dim=-1)
        k = F.normalize(k, dim=-1)

        # fac = dim_head ** -0.5
        res = k.transpose(-2, -1)
        res = torch.matmul(q, res) * self.fac
        res = torch.softmax(res, dim=-1)

        res = torch.matmul(res, v)
        res = einops.rearrange(
            res, "(n nh h) w dh -> n (nh dh) h w", nh=n_heads, dh=dim_head, n=n, h=h
        )
        res = self.fin(res)

        return res


# Axis-based Multi-head Self-Attention (row and col attention)
class NextAttentionZ(nn.Module):
    def __init__(self, num_dims, num_heads=1, bias=True) -> None:
        super().__init__()
        assert num_dims % num_heads == 0
        self.num_dims = num_dims
        self.num_heads = num_heads
        self.row_att = NextAttentionImplZ(num_dims, num_heads, bias)
        self.col_att = NextAttentionImplZ(num_dims, num_heads, bias)
        return

    def forward(self, x: torch.Tensor):
        assert len(x.size()) == 4

        x = self.row_att(x)
        x = x.transpose(-2, -1)
        x = self.col_att(x)
        x = x.transpose(-2, -1)

        return x


# Dual Gated Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, dim, ffn_expansion_factor, bias):
        super(FeedForward, self).__init__()

        hidden_features = int(dim * ffn_expansion_factor)

        self.project_in = nn.Conv2d(dim, hidden_features * 2, kernel_size=1, bias=bias)

        self.dwconv = nn.Conv2d(
            hidden_features * 2,
            hidden_features * 2,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=hidden_features * 2,
            bias=bias,
        )

        self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)

    def forward(self, x):
        x = self.project_in(x)
        x1, x2 = self.dwconv(x).chunk(2, dim=1)
        x = F.gelu(x2) * x1 + F.gelu(x1) * x2
        x = self.project_out(x)
        return x


# Axis-based Transformer Block
class TransformerBlock(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=1,
        ffn_expansion_factor=2.66,
        bias=True,
        LayerNorm_type="WithBias",
    ):
        super(TransformerBlock, self).__init__()

        self.norm1 = LayerNorm(dim, LayerNorm_type)
        self.attn = NextAttentionZ(dim, num_heads)
        self.norm2 = LayerNorm(dim, LayerNorm_type)
        self.ffn = FeedForward(dim, ffn_expansion_factor, bias)

    def forward(self, x):
        x = x + self.attn(self.norm1(x))
        x = x + self.ffn(self.norm2(x))
        return x


##########################################################################
# Overlapped image patch embedding with 3x3 Conv
class OverlapPatchEmbed(nn.Module):
    def __init__(self, in_c=3, embed_dim=48, bias=False):
        super(OverlapPatchEmbed, self).__init__()

        self.proj = nn.Conv2d(
            in_c, embed_dim, kernel_size=3, stride=1, padding=1, bias=bias
        )

    def forward(self, x):
        x = self.proj(x)

        return x


##########################################################################
# Resizing modules
class Downsample(nn.Module):
    def __init__(self, n_feat):
        super(Downsample, self).__init__()

        self.body = nn.Sequential(
            nn.Conv2d(
                n_feat, n_feat // 2, kernel_size=3, stride=1, padding=1, bias=False
            ),
            nn.PixelUnshuffle(2),
        )

    def forward(self, x):
        return self.body(x)


class Upsample(nn.Module):
    def __init__(self, n_feat):
        super(Upsample, self).__init__()

        self.body = nn.Sequential(
            nn.Conv2d(
                n_feat, n_feat * 2, kernel_size=3, stride=1, padding=1, bias=False
            ),
            nn.PixelShuffle(2),
        )

    def forward(self, x):
        return self.body(x)


# Cross-layer Attention Fusion Block
class LAM_Module_v2(nn.Module):
    """Layer attention module"""

    def __init__(self, in_dim, bias=True):
        super(LAM_Module_v2, self).__init__()
        self.chanel_in = in_dim

        self.temperature = nn.Parameter(torch.ones(1))

        self.qkv = nn.Conv2d(
            self.chanel_in, self.chanel_in * 3, kernel_size=1, bias=bias
        )
        self.qkv_dwconv = nn.Conv2d(
            self.chanel_in * 3,
            self.chanel_in * 3,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=self.chanel_in * 3,
            bias=bias,
        )
        self.project_out = nn.Conv2d(
            self.chanel_in, self.chanel_in, kernel_size=1, bias=bias
        )

    def forward(self, x):
        """
        inputs :
            x : input feature maps( B X N X C X H X W)
        returns :
            out : attention value + input feature
            attention: B X N X N
        """
        m_batchsize, N, C, height, width = x.size()

        x_input = x.view(m_batchsize, N * C, height, width)
        qkv = self.qkv_dwconv(self.qkv(x_input))
        q, k, v = qkv.chunk(3, dim=1)
        q = q.view(m_batchsize, N, -1)
        k = k.view(m_batchsize, N, -1)
        v = v.view(m_batchsize, N, -1)

        q = torch.nn.functional.normalize(q, dim=-1)
        k = torch.nn.functional.normalize(k, dim=-1)

        attn = (q @ k.transpose(-2, -1)) * self.temperature
        attn = attn.softmax(dim=-1)

        out_1 = attn @ v
        out_1 = out_1.view(m_batchsize, -1, height, width)

        out_1 = self.project_out(out_1)
        out_1 = out_1.view(m_batchsize, N, C, height, width)

        out = out_1 + x
        out = out.view(m_batchsize, -1, height, width)
        return out


##########################################################################
# ---------- LLFormer -----------------------
class Backbone(nn.Module):
    def __init__(
        self,
        inp_channels=3,
        out_channels=3,
        dim=3,
        num_blocks=[1, 2, 4, 8],
        num_refinement_blocks=1,
        heads=[1, 2, 4, 8],
        ffn_expansion_factor=2.66,
        bias=False,
        LayerNorm_type="WithBias",
        attention=True,
    ):
        super(Backbone, self).__init__()

        self.patch_embed = OverlapPatchEmbed(inp_channels, dim)

        self.encoder_1 = nn.Sequential(
            *[
                TransformerBlock(
                    dim=dim,
                    num_heads=heads[0],
                    ffn_expansion_factor=ffn_expansion_factor,
                    bias=bias,
                    LayerNorm_type=LayerNorm_type,
                )
                for _ in range(num_blocks[0])
            ]
        )

        self.encoder_2 = nn.Sequential(
            *[
                TransformerBlock(
                    dim=int(dim),
                    num_heads=heads[0],
                    ffn_expansion_factor=ffn_expansion_factor,
                    bias=bias,
                    LayerNorm_type=LayerNorm_type,
                )
                for _ in range(num_blocks[0])
            ]
        )

        self.encoder_3 = nn.Sequential(
            *[
                TransformerBlock(
                    dim=int(dim),
                    num_heads=heads[0],
                    ffn_expansion_factor=ffn_expansion_factor,
                    bias=bias,
                    LayerNorm_type=LayerNorm_type,
                )
                for _ in range(num_blocks[0])
            ]
        )

        self.layer_fussion = LAM_Module_v2(in_dim=int(dim * 3))
        self.conv_fuss = nn.Conv2d(int(dim * 3), int(dim), kernel_size=1, bias=bias)

        # self.latent = nn.Sequential(*[
        #     TransformerBlock(dim=int(dim), num_heads=heads[0], ffn_expansion_factor=ffn_expansion_factor, bias=bias,
        #                      LayerNorm_type=LayerNorm_type) for _ in range(num_blocks[0])])

        # self.trans_low = NAFNet()

        # self.coefficient_1_0 = nn.Parameter(torch.ones(
        #     (2, int(int(dim)))), requires_grad=attention)

        self.latent_1 = nn.Sequential(
            *[
                TransformerBlock(
                    dim=int(dim),
                    num_heads=heads[0],
                    ffn_expansion_factor=ffn_expansion_factor,
                    bias=bias,
                    LayerNorm_type=LayerNorm_type,
                )
                for _ in range(num_blocks[0])
            ]
        )
        """
        self.latent_2 = nn.Sequential(*[
            TransformerBlock(dim=int(dim), num_heads=heads[0], ffn_expansion_factor=ffn_expansion_factor, bias=bias,
                             LayerNorm_type=LayerNorm_type) for _ in range(num_blocks[0])])
        """
        self.trans_low_1 = NAFNet(
            middle_blk_num=10, enc_blk_nums=[1, 2, 4], dec_blk_nums=[4, 2, 1]
        )
        # self.trans_low_2 = NAFNet()

        self.coefficient_1_0 = nn.Parameter(
            torch.ones((2, int(int(dim)))), requires_grad=attention
        )

        # self.coefficient_2_0 = nn.Parameter(torch.ones(
        #     (2, int(int(dim)))), requires_grad=attention)

        self.refinement_1 = nn.Sequential(
            *[
                TransformerBlock(
                    dim=int(dim),
                    num_heads=heads[0],
                    ffn_expansion_factor=ffn_expansion_factor,
                    bias=bias,
                    LayerNorm_type=LayerNorm_type,
                )
                for _ in range(num_refinement_blocks)
            ]
        )
        self.refinement_2 = nn.Sequential(
            *[
                TransformerBlock(
                    dim=int(dim),
                    num_heads=heads[0],
                    ffn_expansion_factor=ffn_expansion_factor,
                    bias=bias,
                    LayerNorm_type=LayerNorm_type,
                )
                for _ in range(num_refinement_blocks)
            ]
        )
        self.refinement_3 = nn.Sequential(
            *[
                TransformerBlock(
                    dim=int(dim),
                    num_heads=heads[0],
                    ffn_expansion_factor=ffn_expansion_factor,
                    bias=bias,
                    LayerNorm_type=LayerNorm_type,
                )
                for _ in range(num_refinement_blocks)
            ]
        )

        self.layer_fussion_2 = LAM_Module_v2(in_dim=int(dim * 3))
        self.conv_fuss_2 = nn.Conv2d(int(dim * 3), int(dim), kernel_size=1, bias=bias)

        self.output = nn.Conv2d(
            int(dim), out_channels, kernel_size=3, stride=1, padding=1, bias=bias
        )

    def forward(self, inp):
        inp_enc_encoder1 = self.patch_embed(inp)
        out_enc_encoder1 = self.encoder_1(inp_enc_encoder1)
        out_enc_encoder2 = self.encoder_2(out_enc_encoder1)
        out_enc_encoder3 = self.encoder_3(out_enc_encoder2)

        inp_fusion_123 = torch.cat(
            [
                out_enc_encoder1.unsqueeze(1),
                out_enc_encoder2.unsqueeze(1),
                out_enc_encoder3.unsqueeze(1),
            ],
            dim=1,
        )

        out_fusion_123 = self.layer_fussion(inp_fusion_123)
        out_fusion_123 = self.conv_fuss(out_fusion_123)

        # out_enc = self.trans_low(out_fusion_123)

        # out_fusion_123 = self.latent(out_fusion_123)

        # out = self.coefficient_1_0[0, :][None, :, None, None] * out_fusion_123 + self.coefficient_1_0[1, :][None, :,None, None] * out_enc

        out_enc_1 = self.trans_low_1(out_fusion_123)

        out_fusion_123_1 = self.latent_1(out_fusion_123)

        out = (
            self.coefficient_1_0[0, :][None, :, None, None] * out_fusion_123_1
            + self.coefficient_1_0[1, :][None, :, None, None] * out_enc_1
        )
        # out_enc_2 = self.trans_low_2(out)

        # out_fusion_123_2 = self.latent_2(out)

        # out = self.coefficient_2_0[0, :][None, :, None, None] * out_fusion_123_2 + self.coefficient_2_0[1, :][None, :,None, None] * out_enc_2
        out_1 = self.refinement_1(out)
        out_2 = self.refinement_2(out_1)
        out_3 = self.refinement_3(out_2)

        inp_fusion = torch.cat(
            [out_1.unsqueeze(1), out_2.unsqueeze(1), out_3.unsqueeze(1)], dim=1
        )
        out_fusion_123 = self.layer_fussion_2(inp_fusion)
        out = self.conv_fuss_2(out_fusion_123)
        result = self.output(out)

        return result


class Model(nn.Module):
    def __init__(self, depth=2):
        super(Model, self).__init__()
        self.backbone = Backbone()
        self.lap_pyramid = LapPyramidConv(depth)
        self.trans_high = TransHigh(3, num_high=depth)

    def forward(self, inp):
        pyr_inp = self.lap_pyramid.pyramid_decom(img=inp)
        out_low = self.backbone(pyr_inp[-1])

        inp_up = nn.functional.interpolate(
            pyr_inp[-1], size=(pyr_inp[-2].shape[2], pyr_inp[-2].shape[3])
        )
        out_up = nn.functional.interpolate(
            out_low, size=(pyr_inp[-2].shape[2], pyr_inp[-2].shape[3])
        )
        high_with_low = torch.cat([pyr_inp[-2], inp_up, out_up], 1)

        pyr_inp_trans = self.trans_high(high_with_low, pyr_inp, out_low)

        result = self.lap_pyramid.pyramid_recons(pyr_inp_trans)

        return result


if __name__ == "__main__":
    tensor = torch.randn(1, 3, 1024, 1024).cuda()
    model = Model().cuda()
    output = model(tensor)
    print(output.shape)
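A minimal CPU sketch of the full Model above (the in-file __main__ assumes a 1024x1024 CUDA tensor): the input is split into a 2-level Laplacian pyramid, the low-frequency band goes through Backbone, the high-frequency bands through TransHigh, and the pyramid is reassembled at the input resolution. It assumes the repository root is on PYTHONPATH so the package import resolves.

import torch

from SIFR_models.mfdnet.model import Model

model = Model(depth=2).eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 256, 256))

print(out.shape)  # torch.Size([1, 3, 256, 256]) -- reconstruction matches the input size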
app.py
CHANGED
|
@@ -2,13 +2,30 @@ import numpy as np
|
|
| 2 |
import gradio as gr
|
| 3 |
import numpy as np
|
| 4 |
import random
|
| 5 |
-
|
| 6 |
-
# import torch
|
| 7 |
import spaces
|
| 8 |
import os
|
| 9 |
import base64
|
| 10 |
import json
|
|
|
|
| 11 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
intro = """
|
| 14 |
<div style="text-align:center">
|
|
@@ -142,19 +159,44 @@ def encode_image(pil_image):
|
|
| 142 |
# raise Exception(f"Failed to post: {response}")
|
| 143 |
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
#
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
# --- UI Constants and Helpers ---
|
| 160 |
MAX_SEED = np.iinfo(np.int32).max
|
|
@@ -164,47 +206,137 @@ MAX_SEED = np.iinfo(np.int32).max
|
|
| 164 |
@spaces.GPU(duration=120)
|
| 165 |
def infer(
|
| 166 |
image,
|
| 167 |
-
seed=
|
| 168 |
-
|
| 169 |
num_inference_steps=50,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
progress=gr.Progress(track_tqdm=True),
|
| 171 |
):
|
| 172 |
"""
|
| 173 |
-
Generates an image
|
| 174 |
"""
|
| 175 |
-
#
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
|
| 179 |
-
# seed = 42
|
| 180 |
-
# seed = random.randint(0, MAX_SEED)
|
| 181 |
|
| 182 |
-
|
| 183 |
-
# generator = torch.Generator(device=device).manual_seed(seed)
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
)
|
| 190 |
-
# if rewrite_prompt:
|
| 191 |
-
# # prompt = polish_prompt(prompt, image)
|
| 192 |
-
# print(f"Rewritten Prompt: {prompt}")
|
| 193 |
|
| 194 |
-
#
|
| 195 |
-
|
| 196 |
-
#
|
| 197 |
-
#
|
| 198 |
-
|
| 199 |
-
#
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
-
|
| 206 |
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
|
| 210 |
# --- Examples and UI Layout ---
|
|
@@ -243,6 +375,20 @@ with gr.Blocks(css=css) as demo:
|
|
| 243 |
value=42,
|
| 244 |
)
|
| 245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
# randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
|
| 247 |
|
| 248 |
with gr.Row():
|
|
@@ -306,8 +452,12 @@ with gr.Blocks(css=css) as demo:
|
|
| 306 |
seed,
|
| 307 |
true_guidance_scale,
|
| 308 |
num_inference_steps,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
],
|
| 310 |
-
outputs=[outpainted_result, flarefree_result
|
| 311 |
)
|
| 312 |
|
| 313 |
if __name__ == "__main__":
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import numpy as np
|
| 4 |
import random
|
| 5 |
+
import torch
|
|
|
|
| 6 |
import spaces
|
| 7 |
import os
|
| 8 |
import base64
|
| 9 |
import json
|
| 10 |
+
import torchvision
|
| 11 |
from PIL import Image
|
| 12 |
+
from diffusers import ControlNetModel, DPMSolverMultistepScheduler
|
| 13 |
+
from transformers import Blip2Processor, Blip2ForConditionalGeneration
|
| 14 |
+
|
| 15 |
+
from src.pipelines.pipeline_stable_diffusion_outpaint import OutpaintPipeline
|
| 16 |
+
from src.pipelines.pipeline_controlnet_outpaint import ControlNetOutpaintPipeline
|
| 17 |
+
from src.schedulers.scheduling_pndm import CustomScheduler
|
| 18 |
+
from src.models.unet import U_Net
|
| 19 |
+
from src.models.light_source_regressor import LightSourceRegressor
|
| 20 |
+
from utils.dataset import HFCustomImageLoader
|
| 21 |
+
from utils.utils import (
|
| 22 |
+
blend_with_alpha,
|
| 23 |
+
load_mfdnet_checkpoint,
|
| 24 |
+
predict_flare_from_6_channel,
|
| 25 |
+
predict_flare_from_3_channel,
|
| 26 |
+
blend_light_source,
|
| 27 |
+
)
|
| 28 |
+
from SIFR_models.flare7kpp.model import Uformer
|
| 29 |
|
| 30 |
intro = """
|
| 31 |
<div style="text-align:center">
|
|
|
|
    # raise Exception(f"Failed to post: {response}")


+## --- Model Loading --- ##
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.float16 if device == "cuda" else torch.float32
+print(f"Using device: {device}")
+
+# controlnet
+controlnet = ControlNetModel.from_pretrained(
+    "RayTsai-030/LightsOut-controlnet", torch_dtype=dtype
+)
+
+# outpainter
+pipe = ControlNetOutpaintPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-inpainting", controlnet=controlnet, torch_dtype=dtype
+).to(device)
+pipe.scheduler = CustomScheduler.from_config(pipe.scheduler.config)
+pipe.unet.load_attn_procs("./weights/light_outpaint_lora", use_safetensors=True)
+
+# blip
+processor = Blip2Processor.from_pretrained(
+    "Salesforce/blip2-opt-2.7b", revision="51572668da0eb669e01a189dc22abe6088589a24"
+)
+blip2 = Blip2ForConditionalGeneration.from_pretrained(
+    "Salesforce/blip2-opt-2.7b",
+    torch_dtype=dtype,
+    revision="51572668da0eb669e01a189dc22abe6088589a24",
+)
+blip2 = blip2.to(device)
+
+# light regressor
+lsr_module = LightSourceRegressor()
+ckpt = torch.load("./weights/light_regress/model.pth")
+lsr_module.load_state_dict(ckpt["model"])
+lsr_module.to(device)
+lsr_module.eval()
+
+# SIFR model
+sifr_model = Uformer(img_size=512, img_ch=3, output_ch=6).to(device)
+sifr_model.load_state_dict(torch.load("./weights/net_g_last.pth"))

# --- UI Constants and Helpers ---
MAX_SEED = np.iinfo(np.int32).max
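A quick sanity check, offered here only as a sketch and not part of this commit, that the custom scheduler and the LoRA attention processors are actually wired into the outpainting pipeline after loading (`attn_processors` is a standard diffusers UNet attribute):

# Sketch only: confirm CustomScheduler is installed and LoRA processors were attached.
def check_pipeline_setup(pipe):
    print("scheduler:", pipe.scheduler.__class__.__name__)  # expect CustomScheduler
    lora_procs = [
        name
        for name, proc in pipe.unet.attn_processors.items()
        if "lora" in proc.__class__.__name__.lower()
    ]
    print(f"{len(lora_procs)} attention processors carry LoRA weights")

check_pipeline_setup(pipe)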
@spaces.GPU(duration=120)
def infer(
    image,
+    seed=42,
+    cfg=7.5,
    num_inference_steps=50,
+    left_outpaint=64,
+    right_outpaint=64,
+    up_outpaint=64,
+    down_outpaint=64,
    progress=gr.Progress(track_tqdm=True),
):
    """
+    Generates an image
    """
+    # dataset
+    dataset = HFCustomImageLoader(image, left_outpaint, right_outpaint, up_outpaint, down_outpaint)
+    data = dataset[0]
+
+    # generator
+    generator = torch.Generator(device=device).manual_seed(seed)
+
+    # transformation
+    transform = torchvision.transforms.Compose(
+        [
+            torchvision.transforms.ToTensor(),
+            torchvision.transforms.Normalize(mean=[0.5], std=[0.5]),
+        ]
+    )
+    sifr_transform = torchvision.transforms.Compose(
+        [
+            torchvision.transforms.ToTensor(),
+            torchvision.transforms.Resize((512, 512)),
+        ]
+    )
+
+    threshold = 0.5
+
+    with torch.no_grad():
+        input_img = data["input_img"]
+
+        input_img = transform(input_img).unsqueeze(0).to(device)

+        pred_mask = lsr_module.forward_render(input_img)

+        pred_mask = (pred_mask > threshold).float()

+    if pred_mask.device != "cpu":
+        pred_mask = pred_mask.cpu()
+    pred_mask = pred_mask.numpy()
+
+    data["control_img"] = Image.fromarray(
+        (pred_mask[0, 0] * 255).astype(np.uint8)
+    )
+
+    # print("Finish light source detection...")
+
+    # prepare text prompt
+    inputs = processor(data["blip_img"], return_tensors="pt").to(
+        device=device, dtype=dtype
+    )
+    generate_id = blip2.generate(**inputs, max_new_tokens=20)
+    generated_text = processor.batch_decode(generate_id, skip_special_tokens=True)[
+        0
+    ].strip()
+
+    generated_text += (
+        ", dynamic lighting, intense light source, prominent lens flare, best quality, high resolution, masterpiece, intricate details"
+        # ", full light sources with lens flare, best quality, high resolution"
    )

+    # print(f"Generated text prompt: {generated_text}")
+
+    # Blur mask
+    # data["mask_img"] = data["mask_img"].filter(ImageFilter.GaussianBlur(15))
+
+    # denoise
+    outpaint_result = pipe(
+        prompt=generated_text,
+        negative_prompt="NSFW, (word:1.5), watermark, blurry, missing body, amputation, mutilation",
+        image=data["input_img"],
+        mask_image=data["mask_img"],
+        control_image=data["control_img"],
+        num_inference_steps=num_inference_steps,
+        guidance_scale=cfg,
+        generator=generator,
+        repeat_time=4,
+    ).images[0]
+
+    # save result
+    outpaint_result = np.array(outpaint_result)
+    input_img = np.array(data["input_img"])
+    box = data["box"]
+
+    input_img2 = outpaint_result.copy()
+    input_img2[box[2] : box[3] + 1, box[0] : box[1] + 1] = input_img[
+        box[2] : box[3] + 1, box[0] : box[1] + 1
+    ]

+    outpaint_result = blend_with_alpha(outpaint_result, input_img2, box, blur_size=31)

+    outpaint_result = Image.fromarray(outpaint_result.astype(np.uint8))
+
+    # print("Finish outpainting...")
+
+    # flare removal
+    img = sifr_transform(outpaint_result).unsqueeze(0).cuda()
+
+    with torch.no_grad():
+        output_img = sifr_model(img)
+
+    gamma = torch.Tensor([2.2])
+
+    # flare7k++
+    deflare_result, _, _ = predict_flare_from_6_channel(output_img, gamma)
+
+    # # mfdnet
+    # flare_mask = torch.zeros_like(img)
+    # deflare_img, _ = predict_flare_from_3_channel(
+    #     output_img, flare_mask, output_img, img, img, gamma
+    # )
+    # deflare_img = blend_light_source(img, deflare_img, 0.999)
+
+    if deflare_result.device != "cpu":
+        deflare_result = deflare_result.cpu()
+    deflare_result = deflare_result.squeeze(0).permute(1, 2, 0).numpy()
+    deflare_result = np.clip(deflare_result, 0.0, 1.0)
+    deflare_result = (deflare_result * 255).astype(np.uint8)
+    deflare_result = deflare_result[box[2] : box[3] + 1, box[0] : box[1] + 1, :]
+    deflare_result = Image.fromarray(deflare_result).resize((512, 512), Image.LANCZOS)
+
+    # print("Finish flare removal...")
+
+    return outpaint_result, deflare_result


# --- Examples and UI Layout ---
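For local debugging, a minimal way to exercise `infer` outside the Gradio UI. This is a sketch under assumptions: that `HFCustomImageLoader` accepts a PIL image like the Gradio component provides, that `assets/exp.png` from this repo is a suitable test input, and that the `@spaces.GPU` decorator and the default `gr.Progress` object behave as pass-throughs outside a running Space.

# Sketch: call infer() directly and save both outputs.
from PIL import Image as PILImage

test_img = PILImage.open("assets/exp.png").convert("RGB")
outpainted, deflared = infer(
    test_img, seed=42, cfg=7.5, num_inference_steps=30,
    left_outpaint=64, right_outpaint=64, up_outpaint=64, down_outpaint=64,
)
outpainted.save("outpainted.png")
deflared.save("deflared.png")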
@@ -243,6 +375,20 @@ with gr.Blocks(css=css) as demo:
                    value=42,
                )

+                with gr.Column():
+                    left_outpaint = gr.Slider(
+                        label="Left outpaint (px)", minimum=0, maximum=128, step=1, value=64
+                    )
+                    right_outpaint = gr.Slider(
+                        label="Right outpaint (px)", minimum=0, maximum=128, step=1, value=64
+                    )
+                    up_outpaint = gr.Slider(
+                        label="Up outpaint (px)", minimum=0, maximum=128, step=1, value=64
+                    )
+                    down_outpaint = gr.Slider(
+                        label="Down outpaint (px)", minimum=0, maximum=128, step=1, value=64
+                    )
+
                # randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

                with gr.Row():

@@ -306,8 +452,12 @@ with gr.Blocks(css=css) as demo:
            seed,
            true_guidance_scale,
            num_inference_steps,
+            left_outpaint,
+            right_outpaint,
+            up_outpaint,
+            down_outpaint,
        ],
-        outputs=[outpainted_result, flarefree_result
+        outputs=[outpainted_result, flarefree_result],
    )

if __name__ == "__main__":

requirements.txt
CHANGED
@@ -1,2 +1,15 @@
-gradio
-pydantic==2.10.6
+gradio==4.44.1
+pydantic==2.10.6
+accelerate==0.21.0
+diffusers==0.23.0
+einops==0.8.0
+huggingface-hub==0.25.2
+imageio==2.36.0
+numpy==1.24.1
+opencv-python==4.10.0.84
+scikit-image==0.24.0
+timm==1.0.11
+transformers==4.36.0
+xformers==0.0.20
+spaces
+pillow
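An optional runtime check, a sketch rather than part of the Space, that the core pins above resolve to what actually got installed, using only the standard library:

# Sketch: compare installed package versions against the pins in requirements.txt.
from importlib.metadata import version

for pkg, expected in [("diffusers", "0.23.0"), ("transformers", "4.36.0"), ("gradio", "4.44.1")]:
    installed = version(pkg)
    status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{pkg}: {installed} {status}")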
src/models/__pycache__/light_source_regressor.cpython-39.pyc
ADDED
Binary file (3.41 kB)

src/models/__pycache__/unet.cpython-39.pyc
ADDED
Binary file (3.75 kB)

src/models/light_source_regressor.py
ADDED
@@ -0,0 +1,124 @@
import torch
from torch import nn
from torch.nn import init
from torchvision.models import resnet34, resnet50
import torchvision.models.vision_transformer as vit


class LightSourceRegressor(nn.Module):
    def __init__(self, num_lights=4, alpha=2.0, beta=8.0, **kwargs):
        super(LightSourceRegressor, self).__init__()

        self.num_lights = num_lights
        self.alpha = alpha
        self.beta = beta

        self.model = resnet34(pretrained=True)
        # self.model = resnet50(pretrained=True)
        # self.model = vit.vit_b_16(pretrained=True)
        self.init_resnet()
        # self.init_vit()

        self.xyr_mlp = nn.Sequential(
            nn.Linear(self.last_dim, 3 * self.num_lights),
        )
        self.p_mlp = nn.Sequential(
            nn.Linear(self.last_dim, self.num_lights),
            nn.Sigmoid(),  # ensure p is in [0, 1]
        )

    def init_resnet(self):
        self.last_dim = self.model.fc.in_features
        self.model.fc = nn.Identity()

    def init_vit(self):
        self.model.image_size = 512
        old_pos_embed = self.model.encoder.pos_embedding
        num_patches_old = (224 // 16) ** 2
        num_patches_new = (512 // 16) ** 2

        if num_patches_new != num_patches_old:
            old_pos_embed = old_pos_embed[:, 1:]
            old_pos_embed = nn.functional.interpolate(
                old_pos_embed.permute(0, 2, 1), size=(num_patches_new,), mode="linear"
            )
            old_pos_embed = old_pos_embed.permute(0, 2, 1)

            # new positional embedding
            self.model.encoder.pos_embedding = nn.Parameter(
                torch.cat(
                    [self.model.encoder.pos_embedding[:, :1], old_pos_embed], dim=1
                )
            )

        # num_classes = 4 * self.num_lights  # x, y, r, p
        # self.model.heads.head = nn.Linear(self.model.hidden_dim, num_classes)

        # remove the head
        self.last_dim = self.model.hidden_dim
        self.model.heads.head = nn.Identity()

    def forward(self, x, height=512, width=512, smoothness=0.1, merge=False):
        _x = self.model(x)  # [B, last_dim]

        _xyr = self.xyr_mlp(_x)
        _xyr = _xyr.view(-1, self.num_lights, 3)

        _p = self.p_mlp(_x)
        _p = _p.view(-1, self.num_lights)

        output = torch.cat([_xyr, _p.unsqueeze(-1)], dim=-1)

        return output

    def forward_render(self, x, height=512, width=512, smoothness=0.1, merge=False):
        _x = self.forward(x)

        _xy = _x[:, :, :2]
        _r = _x[:, :, 2]
        _p = _x[:, :, 3]

        masks = None
        masks_merge = None
        for b in range(_x.size(0)):
            x, y, r = _xy[b, :, 0] * width, _xy[b, :, 1] * width, _r[b] * width / 2
            p = _p[b]

            mask_list = []
            for i in range(self.num_lights):
                if r[i] < 0 or r[i] > width or p[i] < 0.5:
                    continue

                y_coords, x_coords = torch.meshgrid(
                    torch.arange(height, device=x.device),
                    torch.arange(width, device=x.device),
                    indexing="ij",
                )

                distances = torch.sqrt((x_coords - x[i]) ** 2 + (y_coords - y[i]) ** 2)
                mask_i = torch.sigmoid(smoothness * (r[i] - distances))
                mask_list.append(mask_i)

            if len(mask_list) == 0:
                _mask_merge = torch.zeros(1, 1, height, width, device=x.device)
            else:
                _mask_merge = torch.stack(mask_list, dim=0).sum(dim=0).unsqueeze(0)
                _mask_merge = _mask_merge.unsqueeze(0)

            masks_merge = (
                _mask_merge
                if masks_merge is None
                else torch.cat([masks_merge, _mask_merge], dim=0)
            )

        masks_merge = torch.clamp(masks_merge, 0, 1)

        return masks_merge  # [B, 1, H, W]


if __name__ == "__main__":
    # pydiffvg.set_use_gpu(torch.cuda.is_available())
    model = LightSourceRegressor(num_lights=4).cuda()
    x = torch.randn(8, 3, 512, 512, device="cuda")
    y = model.forward_render(x)
    print(y.shape)
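To connect this module to the rest of the pipeline: the soft mask from `forward_render` is thresholded into the binary light-source map that the outpainting ControlNet consumes. A small CPU sketch mirroring what app.py does (the 0.5 threshold is the value used there):

# Sketch: render light-source circles and turn them into a ControlNet condition image.
import numpy as np
import torch
from PIL import Image

regressor = LightSourceRegressor(num_lights=4).eval()
with torch.no_grad():
    soft_mask = regressor.forward_render(torch.rand(1, 3, 512, 512))  # [1, 1, 512, 512]
binary = (soft_mask > 0.5).float()[0, 0].cpu().numpy()
control_img = Image.fromarray((binary * 255).astype(np.uint8))  # fed to the ControlNet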
src/models/unet.py
ADDED
@@ -0,0 +1,129 @@
from torch import nn
from torch.nn import init
import torch
import torch.nn.functional as F


class conv_block(nn.Module):
    def __init__(self, ch_in, ch_out):
        super(conv_block, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(ch_in, ch_out, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(ch_out),
            nn.ReLU(inplace=True),
            nn.Conv2d(ch_out, ch_out, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(ch_out),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        x = self.conv(x)
        return x


class up_conv(nn.Module):
    def __init__(self, ch_in, ch_out):
        super(up_conv, self).__init__()
        self.up = nn.Sequential(
            nn.Upsample(scale_factor=2),
            nn.Conv2d(ch_in, ch_out, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(ch_out),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        x = self.up(x)
        return x


class U_Net(nn.Module):
    def __init__(self, img_ch=3, output_ch=1, multi_stage=False):
        super(U_Net, self).__init__()

        self.Maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

        self.Conv1 = conv_block(ch_in=img_ch, ch_out=64)
        self.Conv2 = conv_block(ch_in=64, ch_out=128)
        self.Conv3 = conv_block(ch_in=128, ch_out=256)
        self.Conv4 = conv_block(ch_in=256, ch_out=512)
        self.Conv5 = conv_block(ch_in=512, ch_out=1024)

        self.Up5 = up_conv(ch_in=1024, ch_out=512)
        self.Up_conv5 = conv_block(ch_in=1024, ch_out=512)

        self.Up4 = up_conv(ch_in=512, ch_out=256)
        self.Up_conv4 = conv_block(ch_in=512, ch_out=256)

        self.Up3 = up_conv(ch_in=256, ch_out=128)
        self.Up_conv3 = conv_block(ch_in=256, ch_out=128)

        self.Up2 = up_conv(ch_in=128, ch_out=64)
        self.Up_conv2 = conv_block(ch_in=128, ch_out=64)

        self.Conv_1x1 = nn.Conv2d(64, output_ch, kernel_size=1, stride=1, padding=0)
        self.activation = nn.Sequential(nn.Sigmoid())
        # init_weights(self)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        init_type = "normal"
        gain = 0.02
        classname = m.__class__.__name__
        if hasattr(m, "weight") and (
            classname.find("Conv") != -1 or classname.find("Linear") != -1
        ):
            if init_type == "normal":
                init.normal_(m.weight.data, 0.0, gain)
            elif init_type == "xavier":
                init.xavier_normal_(m.weight.data, gain=gain)
            elif init_type == "kaiming":
                init.kaiming_normal_(m.weight.data, a=0, mode="fan_in")
            elif init_type == "orthogonal":
                init.orthogonal_(m.weight.data, gain=gain)
            else:
                raise NotImplementedError(
                    "initialization method [%s] is not implemented" % init_type
                )
            if hasattr(m, "bias") and m.bias is not None:
                init.constant_(m.bias.data, 0.0)
        elif classname.find("BatchNorm2d") != -1:
            init.normal_(m.weight.data, 1.0, gain)
            init.constant_(m.bias.data, 0.0)

    def forward(self, x):
        # encoding path
        x1 = self.Conv1(x)

        x2 = self.Maxpool(x1)
        x2 = self.Conv2(x2)

        x3 = self.Maxpool(x2)
        x3 = self.Conv3(x3)

        x4 = self.Maxpool(x3)
        x4 = self.Conv4(x4)

        x5 = self.Maxpool(x4)
        x5 = self.Conv5(x5)

        # decoding + concat path
        d5 = self.Up5(x5)
        d5 = torch.cat((x4, d5), dim=1)

        d5 = self.Up_conv5(d5)

        d4 = self.Up4(d5)
        d4 = torch.cat((x3, d4), dim=1)
        d4 = self.Up_conv4(d4)

        d3 = self.Up3(d4)
        d3 = torch.cat((x2, d3), dim=1)
        d3 = self.Up_conv3(d3)

        d2 = self.Up2(d3)
        d2 = torch.cat((x1, d2), dim=1)
        d2 = self.Up_conv2(d2)

        d1 = self.Conv_1x1(d2)
        d1 = self.activation(d1)
        return d1
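A quick shape check for the U-Net above (a sketch, not part of the file): the symmetric encoder/decoder preserves spatial size and the sigmoid head keeps values in [0, 1], which is what makes it usable for per-pixel masks.

import torch

net = U_Net(img_ch=3, output_ch=1)
with torch.no_grad():
    out = net(torch.rand(1, 3, 256, 256))
print(out.shape)  # torch.Size([1, 1, 256, 256]); same spatial size as the input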
src/pipelines/__pycache__/pipeline_controlnet_outpaint.cpython-39.pyc
ADDED
Binary file (7.49 kB)

src/pipelines/__pycache__/pipeline_stable_diffusion_outpaint.cpython-39.pyc
ADDED
Binary file (16 kB)

src/pipelines/pipeline_controlnet_outpaint.py
ADDED
@@ -0,0 +1,448 @@
import torch

from typing import List, Union, Dict, Any, Callable, Optional, Tuple
from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel
from diffusers.utils.torch_utils import randn_tensor, is_compiled_module
from diffusers.models import ControlNetModel
from diffusers.pipelines.controlnet import MultiControlNetModel
from diffusers.image_processor import PipelineImageInput
from diffusers.pipelines.stable_diffusion.pipeline_output import (
    StableDiffusionPipelineOutput,
)


class ControlNetOutpaintPipeline(StableDiffusionControlNetInpaintPipeline):
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: PipelineImageInput = None,
        mask_image: PipelineImageInput = None,
        control_image: PipelineImageInput = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        strength: float = 1.0,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_conditioning_scale: Union[float, List[float]] = 0.5,
        guess_mode: bool = False,
        control_guidance_start: Union[float, List[float]] = 0.0,
        control_guidance_end: Union[float, List[float]] = 1.0,
        clip_skip: Optional[int] = None,
        ## add
        repeat_time: int = 4,
        ##
        **kwargs: Any,
    ):
        r""" """
        controlnet = (
            self.controlnet._orig_mod
            if is_compiled_module(self.controlnet)
            else self.controlnet
        )

        # self.init_filter()

        # align format for control guidance
        if not isinstance(control_guidance_start, list) and isinstance(
            control_guidance_end, list
        ):
            control_guidance_start = len(control_guidance_end) * [
                control_guidance_start
            ]
        elif not isinstance(control_guidance_end, list) and isinstance(
            control_guidance_start, list
        ):
            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
        elif not isinstance(control_guidance_start, list) and not isinstance(
            control_guidance_end, list
        ):
            mult = (
                len(controlnet.nets)
                if isinstance(controlnet, MultiControlNetModel)
                else 1
            )
            control_guidance_start, control_guidance_end = mult * [
                control_guidance_start
            ], mult * [control_guidance_end]

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            control_image,
            height,
            width,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            controlnet_conditioning_scale,
            control_guidance_start,
            control_guidance_end,
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        if isinstance(controlnet, MultiControlNetModel) and isinstance(
            controlnet_conditioning_scale, float
        ):
            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(
                controlnet.nets
            )

        global_pool_conditions = (
            controlnet.config.global_pool_conditions
            if isinstance(controlnet, ControlNetModel)
            else controlnet.nets[0].config.global_pool_conditions
        )
        guess_mode = guess_mode or global_pool_conditions

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None)
            if cross_attention_kwargs is not None
            else None
        )
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
            clip_skip=clip_skip,
        )
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        # 4. Prepare image
        if isinstance(controlnet, ControlNetModel):
            control_image = self.prepare_control_image(
                image=control_image,
                width=width,
                height=height,
                batch_size=batch_size * num_images_per_prompt,
                num_images_per_prompt=num_images_per_prompt,
                device=device,
                dtype=controlnet.dtype,
                do_classifier_free_guidance=do_classifier_free_guidance,
                guess_mode=guess_mode,
            )
        elif isinstance(controlnet, MultiControlNetModel):
            control_images = []

            for control_image_ in control_image:
                control_image_ = self.prepare_control_image(
                    image=control_image_,
                    width=width,
                    height=height,
                    batch_size=batch_size * num_images_per_prompt,
                    num_images_per_prompt=num_images_per_prompt,
                    device=device,
                    dtype=controlnet.dtype,
                    do_classifier_free_guidance=do_classifier_free_guidance,
                    guess_mode=guess_mode,
                )

                control_images.append(control_image_)

            control_image = control_images
        else:
            assert False

        # 4. Preprocess mask and image - resizes image and mask w.r.t height and width
        init_image = self.image_processor.preprocess(image, height=height, width=width)
        init_image = init_image.to(dtype=torch.float32)

        mask = self.mask_processor.preprocess(mask_image, height=height, width=width)

        masked_image = init_image * (mask < 0.5)
        _, _, height, width = init_image.shape

        # 5. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps, num_inference_steps = self.get_timesteps(
            num_inference_steps=num_inference_steps, strength=strength, device=device
        )
        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
        is_strength_max = strength == 1.0

        # 6. Prepare latent variables
        num_channels_latents = self.vae.config.latent_channels
        num_channels_unet = self.unet.config.in_channels
        return_image_latents = True

        latents_outputs = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
            image=init_image,
            timestep=latent_timestep,
            is_strength_max=is_strength_max,
            return_noise=True,
            return_image_latents=return_image_latents,
        )

        if return_image_latents:
            latents, noise, image_latents = latents_outputs
        else:
            latents, noise = latents_outputs

        # 7. Prepare mask latent variables
        mask, masked_image_latents = self.prepare_mask_latents(
            mask,
            masked_image,
            batch_size * num_images_per_prompt,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            do_classifier_free_guidance,
        )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7.1 Create tensor stating which controlnets to keep
        controlnet_keep = []
        for i in range(len(timesteps)):
            keeps = [
                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
                for s, e in zip(control_guidance_start, control_guidance_end)
            ]
            controlnet_keep.append(
                keeps[0] if isinstance(controlnet, ControlNetModel) else keeps
            )

        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            # for i, t in enumerate(timesteps):

            ## modify
            i = 0
            reinject = repeat_time
            while i < len(timesteps):
                # expand the latents if we are doing classifier free guidance
                t = timesteps[i]
                latent_model_input = (
                    torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                )
                latent_model_input = self.scheduler.scale_model_input(
                    latent_model_input, t
                )

                # controlnet(s) inference
                if guess_mode and do_classifier_free_guidance:
                    # Infer ControlNet only for the conditional batch.
                    control_model_input = latents
                    control_model_input = self.scheduler.scale_model_input(
                        control_model_input, t
                    )
                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
                else:
                    control_model_input = latent_model_input
                    controlnet_prompt_embeds = prompt_embeds

                if isinstance(controlnet_keep[i], list):
                    cond_scale = [
                        c * s
                        for c, s in zip(
                            controlnet_conditioning_scale, controlnet_keep[i]
                        )
                    ]
                else:
                    controlnet_cond_scale = controlnet_conditioning_scale
                    if isinstance(controlnet_cond_scale, list):
                        controlnet_cond_scale = controlnet_cond_scale[0]
                    cond_scale = controlnet_cond_scale * controlnet_keep[i]

                down_block_res_samples, mid_block_res_sample = self.controlnet(
                    control_model_input,
                    t,
                    encoder_hidden_states=controlnet_prompt_embeds,
                    controlnet_cond=control_image,
                    conditioning_scale=cond_scale,
                    guess_mode=guess_mode,
                    return_dict=False,
                )

                if guess_mode and do_classifier_free_guidance:
                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [
                        torch.cat([torch.zeros_like(d), d])
                        for d in down_block_res_samples
                    ]
                    mid_block_res_sample = torch.cat(
                        [
                            torch.zeros_like(mid_block_res_sample),
                            mid_block_res_sample,
                        ]
                    )

                # predict the noise residual
                if num_channels_unet == 9:
                    latent_model_input = torch.cat(
                        [latent_model_input, mask, masked_image_latents], dim=1
                    )

                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    down_block_additional_residuals=down_block_res_samples,
                    mid_block_additional_residual=mid_block_res_sample,
                    return_dict=False,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (
                        noise_pred_text - noise_pred_uncond
                    )

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(
                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False
                )[0]

                if num_channels_unet == 4:
                    init_latents_proper = image_latents
                    if do_classifier_free_guidance:
                        init_mask, _ = mask.chunk(2)
                    else:
                        init_mask = mask

                    if i < len(timesteps) - 1:
                        noise_timestep = timesteps[i + 1]
                        init_latents_proper = self.scheduler.add_noise(
                            init_latents_proper,
                            noise,
                            torch.tensor([noise_timestep]),
                        )

                    latents = (
                        1 - init_mask
                    ) * init_latents_proper + init_mask * latents

                i += 1

                ## noise reinjection
                if i > 0 and i < int(len(timesteps) - 1) and reinject > 0:
                    current_timestep = timesteps[i]
                    target_timestep = timesteps[i - 1]
                    new_noise = torch.randn_like(latents)

                    # step back x_t-1 -> x_t
                    latents = self.scheduler.step_back(
                        latents,
                        new_noise,
                        torch.tensor([current_timestep]),
                        torch.tensor([target_timestep]),
                    )
                    i -= 1
                    reinject -= 1
                else:
                    # reinject = repeat_time

                    # schedule
                    if i >= int(len(timesteps) * 0.8):
                        reinject = 0
                    elif i >= int(len(timesteps) * 0.6):
                        reinject = max(0, repeat_time - 3)
                    elif i >= int(len(timesteps) * 0.4):
                        reinject = max(0, repeat_time - 2)
                    elif i >= int(len(timesteps) * 0.2):
                        reinject = max(0, repeat_time - 1)
                    else:
                        reinject = repeat_time

                    # call the callback, if provided
                    if i == len(timesteps) - 1 or (
                        (i + 1) > num_warmup_steps
                        and (i + 1) % self.scheduler.order == 0
                    ):
                        progress_bar.update()
                        if callback is not None and i % callback_steps == 0:
                            step_idx = i // getattr(self.scheduler, "order", 1)
                            callback(step_idx, t, latents)

        # If we do sequential model offloading, let's offload unet and controlnet
        # manually for max memory savings
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
            torch.cuda.empty_cache()

        if not output_type == "latent":
            image = self.vae.decode(
                latents / self.vae.config.scaling_factor,
                return_dict=False,
                generator=generator,
            )[0]
            image, has_nsfw_concept = self.run_safety_checker(
                image, device, prompt_embeds.dtype
            )
        else:
            image = latents
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize
        )

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(
            images=image, nsfw_content_detected=has_nsfw_concept
        )
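The noise-reinjection budget in the loop above decays as denoising progresses: more step-back/re-noise repeats are allowed early, none in the last 20% of the schedule. A standalone helper (a sketch, not part of the file) that reproduces the same thresholds makes the decay explicit:

# Sketch: the per-step reinjection budget used by the loop, factored out for clarity.
def reinjection_budget(i: int, total_steps: int, repeat_time: int = 4) -> int:
    """Remaining step-back/re-noise repeats allowed once step i is reached."""
    if i >= int(total_steps * 0.8):
        return 0
    if i >= int(total_steps * 0.6):
        return max(0, repeat_time - 3)
    if i >= int(total_steps * 0.4):
        return max(0, repeat_time - 2)
    if i >= int(total_steps * 0.2):
        return max(0, repeat_time - 1)
    return repeat_time

# e.g. with 50 timesteps and repeat_time=4: 4 repeats before step 10, tapering to 0 after step 40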
src/pipelines/pipeline_stable_diffusion_outpaint.py
ADDED
@@ -0,0 +1,517 @@
import torch

from typing import List, Union, Dict, Any, Callable, Optional, Tuple
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils import make_image_grid, load_image, deprecate
from diffusers.models import AsymmetricAutoencoderKL
from diffusers.image_processor import PipelineImageInput
from diffusers.pipelines.stable_diffusion.pipeline_output import (
    StableDiffusionPipelineOutput,
)


class OutpaintPipeline(StableDiffusionInpaintPipeline):
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: PipelineImageInput = None,
        mask_image: PipelineImageInput = None,
        control_image: PipelineImageInput = None,
        masked_image_latents: torch.FloatTensor = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        strength: float = 1.0,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        clip_skip: int = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        ## add
        repeat_time: int = 4,
        ##
        **kwargs,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to
                be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch
                tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the
                expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the
                expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but
                if passing latents directly it is not encoded again.
            mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
                H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
                1)`, or `(H, W)`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            strength (`float`, *optional*, defaults to 1.0):
                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
                essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
        Examples:

        ```py
        >>> import PIL
        >>> import requests
        >>> import torch
        >>> from io import BytesIO

        >>> from diffusers import StableDiffusionInpaintPipeline


        >>> def download_image(url):
        ...     response = requests.get(url)
        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")


        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"

        >>> init_image = download_image(img_url).resize((512, 512))
        >>> mask_image = download_image(mask_url).resize((512, 512))

        >>> pipe = StableDiffusionInpaintPipeline.from_pretrained(
        ...     "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
        ... )
        >>> pipe = pipe.to("cuda")

        >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
        >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
        ```

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated images and the
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.
        """

        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        if callback is not None:
            deprecate(
                "callback",
                "1.0.0",
                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )
        if callback_steps is not None:
            deprecate(
                "callback_steps",
                "1.0.0",
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs
        self.check_inputs(
            prompt,
            height,
            width,
            strength,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            callback_on_step_end_tensor_inputs,
        )

        self._guidance_scale = guidance_scale
        self._clip_skip = clip_skip
        self._cross_attention_kwargs = cross_attention_kwargs

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None)
            if cross_attention_kwargs is not None
            else None
        )
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
            clip_skip=self.clip_skip,
        )
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        # 4. set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps, num_inference_steps = self.get_timesteps(
            num_inference_steps=num_inference_steps, strength=strength, device=device
        )
        # check that number of inference steps is not < 1 - as this doesn't make sense
        if num_inference_steps < 1:
            raise ValueError(
                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
            )
        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
        is_strength_max = strength == 1.0

        # 5. Preprocess mask and image

        init_image = self.image_processor.preprocess(image, height=height, width=width)
        init_image = init_image.to(dtype=torch.float32)

        # 6. Prepare latent variables
        num_channels_latents = self.vae.config.latent_channels
        num_channels_unet = self.unet.config.in_channels
        return_image_latents = num_channels_unet == 4

        latents_outputs = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
            image=init_image,
            timestep=latent_timestep,
            is_strength_max=is_strength_max,
            return_noise=True,
            return_image_latents=return_image_latents,
        )

        if return_image_latents:
            latents, noise, image_latents = latents_outputs
        else:
            latents, noise = latents_outputs

        # 7. Prepare mask latent variables
        mask_condition = self.mask_processor.preprocess(
            mask_image, height=height, width=width
        )

        if masked_image_latents is None:
            masked_image = init_image * (mask_condition < 0.5)
        else:
            masked_image = masked_image_latents

        mask, masked_image_latents = self.prepare_mask_latents(
            mask_condition,
            masked_image,
            batch_size * num_images_per_prompt,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            self.do_classifier_free_guidance,
        )

        # 8. Check that sizes of mask, masked image and latents match
        if num_channels_unet == 9:
            # default case for runwayml/stable-diffusion-inpainting
            num_channels_mask = mask.shape[1]
            num_channels_masked_image = masked_image_latents.shape[1]
            if (
                num_channels_latents + num_channels_mask + num_channels_masked_image
                != self.unet.config.in_channels
            ):
                raise ValueError(
                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                    " `pipeline.unet` or your `mask_image` or `image` input."
                )
        elif num_channels_unet != 4:
            raise ValueError(
                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
            )

        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 9.5 Optionally get Guidance Scale Embedding
        timestep_cond = None
        if self.unet.config.time_cond_proj_dim is not None:
            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
                batch_size * num_images_per_prompt
            )
            timestep_cond = self.get_guidance_scale_embedding(
                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
            ).to(device=device, dtype=latents.dtype)

        # 10. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
|
| 337 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
| 338 |
+
# for i in range(len(timesteps)):
|
| 339 |
+
|
| 340 |
+
## modify
|
| 341 |
+
i = 0
|
| 342 |
+
reinject = repeat_time
|
| 343 |
+
while i < len(timesteps):
|
| 344 |
+
# expand the latents if we are doing classifier free guidance
|
| 345 |
+
latent_model_input = (
|
| 346 |
+
torch.cat([latents] * 2)
|
| 347 |
+
if self.do_classifier_free_guidance
|
| 348 |
+
else latents
|
| 349 |
+
)
|
| 350 |
+
|
| 351 |
+
# concat latents, mask, masked_image_latents in the channel dimension
|
| 352 |
+
latent_model_input = self.scheduler.scale_model_input(
|
| 353 |
+
latent_model_input, timesteps[i]
|
| 354 |
+
)
|
| 355 |
+
|
| 356 |
+
if num_channels_unet == 9:
|
| 357 |
+
latent_model_input = torch.cat(
|
| 358 |
+
[latent_model_input, mask, masked_image_latents], dim=1
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
# predict the noise residual
|
| 362 |
+
noise_pred = self.unet(
|
| 363 |
+
latent_model_input,
|
| 364 |
+
timesteps[i],
|
| 365 |
+
encoder_hidden_states=prompt_embeds,
|
| 366 |
+
timestep_cond=timestep_cond,
|
| 367 |
+
cross_attention_kwargs=self.cross_attention_kwargs,
|
| 368 |
+
return_dict=False,
|
| 369 |
+
)[0]
|
| 370 |
+
|
| 371 |
+
# perform guidance
|
| 372 |
+
if self.do_classifier_free_guidance:
|
| 373 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
| 374 |
+
noise_pred = noise_pred_uncond + self.guidance_scale * (
|
| 375 |
+
noise_pred_text - noise_pred_uncond
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
+
# compute the previous noisy sample x_t -> x_t-1
|
| 379 |
+
latents = self.scheduler.step(
|
| 380 |
+
noise_pred,
|
| 381 |
+
timesteps[i],
|
| 382 |
+
latents,
|
| 383 |
+
**extra_step_kwargs,
|
| 384 |
+
return_dict=False,
|
| 385 |
+
)[0]
|
| 386 |
+
if num_channels_unet == 4:
|
| 387 |
+
init_latents_proper = image_latents
|
| 388 |
+
if self.do_classifier_free_guidance:
|
| 389 |
+
init_mask, _ = mask.chunk(2)
|
| 390 |
+
else:
|
| 391 |
+
init_mask = mask
|
| 392 |
+
|
| 393 |
+
if i < len(timesteps) - 1:
|
| 394 |
+
noise_timestep = timesteps[i + 1]
|
| 395 |
+
init_latents_proper = self.scheduler.add_noise(
|
| 396 |
+
init_latents_proper, noise, torch.tensor([noise_timestep])
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
latents = (
|
| 400 |
+
1 - init_mask
|
| 401 |
+
) * init_latents_proper + init_mask * latents
|
| 402 |
+
|
| 403 |
+
if callback_on_step_end is not None:
|
| 404 |
+
callback_kwargs = {}
|
| 405 |
+
for k in callback_on_step_end_tensor_inputs:
|
| 406 |
+
callback_kwargs[k] = locals()[k]
|
| 407 |
+
callback_outputs = callback_on_step_end(
|
| 408 |
+
self, i, timesteps[i], callback_kwargs
|
| 409 |
+
)
|
| 410 |
+
|
| 411 |
+
latents = callback_outputs.pop("latents", latents)
|
| 412 |
+
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
| 413 |
+
negative_prompt_embeds = callback_outputs.pop(
|
| 414 |
+
"negative_prompt_embeds", negative_prompt_embeds
|
| 415 |
+
)
|
| 416 |
+
mask = callback_outputs.pop("mask", mask)
|
| 417 |
+
masked_image_latents = callback_outputs.pop(
|
| 418 |
+
"masked_image_latents", masked_image_latents
|
| 419 |
+
)
|
| 420 |
+
|
| 421 |
+
# # call the callback, if provided
|
| 422 |
+
# if i == len(timesteps) - 1 or (
|
| 423 |
+
# (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
|
| 424 |
+
# ):
|
| 425 |
+
# progress_bar.update()
|
| 426 |
+
# if callback is not None and i % callback_steps == 0:
|
| 427 |
+
# step_idx = i // getattr(self.scheduler, "order", 1)
|
| 428 |
+
# callback(step_idx, timesteps[i], latents)
|
| 429 |
+
|
| 430 |
+
i += 1
|
| 431 |
+
|
| 432 |
+
## noise reinjection
|
| 433 |
+
if i > 0 and i < int(len(timesteps) - 1) and reinject != 0:
|
| 434 |
+
current_timestep = timesteps[i]
|
| 435 |
+
target_timestep = timesteps[i - 1]
|
| 436 |
+
new_nosie = torch.randn_like(latents)
|
| 437 |
+
|
| 438 |
+
# step back x_t-1 -> x_t
|
| 439 |
+
latents = self.scheduler.step_back(
|
| 440 |
+
latents,
|
| 441 |
+
new_nosie,
|
| 442 |
+
torch.tensor([current_timestep]),
|
| 443 |
+
torch.tensor([target_timestep]),
|
| 444 |
+
)
|
| 445 |
+
i -= 1
|
| 446 |
+
reinject -= 1
|
| 447 |
+
else:
|
| 448 |
+
# reinject = repeat_time
|
| 449 |
+
|
| 450 |
+
# schedule
|
| 451 |
+
if i >= int(len(timesteps) * 0.85):
|
| 452 |
+
reinject = 0
|
| 453 |
+
elif i >= int(len(timesteps) * 0.8):
|
| 454 |
+
reinject = 1
|
| 455 |
+
elif i >= int(len(timesteps) * 0.7):
|
| 456 |
+
reinject = 2
|
| 457 |
+
elif i >= int(len(timesteps) * 0.5):
|
| 458 |
+
reinject = 3
|
| 459 |
+
else:
|
| 460 |
+
reinject = 4
|
| 461 |
+
|
| 462 |
+
# call the callback, if provided
|
| 463 |
+
if i == len(timesteps) - 1 or (
|
| 464 |
+
(i + 1) > num_warmup_steps
|
| 465 |
+
and (i + 1) % self.scheduler.order == 0
|
| 466 |
+
):
|
| 467 |
+
progress_bar.update()
|
| 468 |
+
if callback is not None and i % callback_steps == 0:
|
| 469 |
+
step_idx = i // getattr(self.scheduler, "order", 1)
|
| 470 |
+
callback(step_idx, timesteps[i], latents)
|
| 471 |
+
|
| 472 |
+
if not output_type == "latent":
|
| 473 |
+
condition_kwargs = {}
|
| 474 |
+
if isinstance(self.vae, AsymmetricAutoencoderKL):
|
| 475 |
+
init_image = init_image.to(
|
| 476 |
+
device=device, dtype=masked_image_latents.dtype
|
| 477 |
+
)
|
| 478 |
+
init_image_condition = init_image.clone()
|
| 479 |
+
init_image = self._encode_vae_image(init_image, generator=generator)
|
| 480 |
+
mask_condition = mask_condition.to(
|
| 481 |
+
device=device, dtype=masked_image_latents.dtype
|
| 482 |
+
)
|
| 483 |
+
condition_kwargs = {
|
| 484 |
+
"image": init_image_condition,
|
| 485 |
+
"mask": mask_condition,
|
| 486 |
+
}
|
| 487 |
+
image = self.vae.decode(
|
| 488 |
+
latents / self.vae.config.scaling_factor,
|
| 489 |
+
return_dict=False,
|
| 490 |
+
generator=generator,
|
| 491 |
+
**condition_kwargs,
|
| 492 |
+
)[0]
|
| 493 |
+
image, has_nsfw_concept = self.run_safety_checker(
|
| 494 |
+
image, device, prompt_embeds.dtype
|
| 495 |
+
)
|
| 496 |
+
else:
|
| 497 |
+
image = latents
|
| 498 |
+
has_nsfw_concept = None
|
| 499 |
+
|
| 500 |
+
if has_nsfw_concept is None:
|
| 501 |
+
do_denormalize = [True] * image.shape[0]
|
| 502 |
+
else:
|
| 503 |
+
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
| 504 |
+
|
| 505 |
+
image = self.image_processor.postprocess(
|
| 506 |
+
image, output_type=output_type, do_denormalize=do_denormalize
|
| 507 |
+
)
|
| 508 |
+
|
| 509 |
+
# Offload all models
|
| 510 |
+
self.maybe_free_model_hooks()
|
| 511 |
+
|
| 512 |
+
if not return_dict:
|
| 513 |
+
return (image, has_nsfw_concept)
|
| 514 |
+
|
| 515 |
+
return StableDiffusionPipelineOutput(
|
| 516 |
+
images=image, nsfw_content_detected=has_nsfw_concept
|
| 517 |
+
)
|
src/schedulers/__pycache__/scheduling_pndm.cpython-39.pyc
ADDED
|
Binary file (3.73 kB). View file
|
|
|
src/schedulers/scheduling_pndm.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from typing import List, Optional, Tuple, Union
|
| 3 |
+
from diffusers import PNDMScheduler
|
| 4 |
+
from diffusers.schedulers.scheduling_utils import SchedulerOutput
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class CustomScheduler(PNDMScheduler):
|
| 8 |
+
def step_plms(
|
| 9 |
+
self,
|
| 10 |
+
model_output: torch.FloatTensor,
|
| 11 |
+
timestep: int,
|
| 12 |
+
sample: torch.FloatTensor,
|
| 13 |
+
return_dict: bool = True,
|
| 14 |
+
) -> Union[SchedulerOutput, Tuple]:
|
| 15 |
+
"""
|
| 16 |
+
Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
|
| 17 |
+
the linear multistep method. It performs one forward pass multiple times to approximate the solution.
|
| 18 |
+
|
| 19 |
+
Args:
|
| 20 |
+
model_output (`torch.FloatTensor`):
|
| 21 |
+
The direct output from learned diffusion model.
|
| 22 |
+
timestep (`int`):
|
| 23 |
+
The current discrete timestep in the diffusion chain.
|
| 24 |
+
sample (`torch.FloatTensor`):
|
| 25 |
+
A current instance of a sample created by the diffusion process.
|
| 26 |
+
return_dict (`bool`):
|
| 27 |
+
Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
|
| 28 |
+
|
| 29 |
+
Returns:
|
| 30 |
+
[`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
|
| 31 |
+
If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
|
| 32 |
+
tuple is returned where the first element is the sample tensor.
|
| 33 |
+
|
| 34 |
+
"""
|
| 35 |
+
if self.num_inference_steps is None:
|
| 36 |
+
raise ValueError(
|
| 37 |
+
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
if not self.config.skip_prk_steps and len(self.ets) < 3:
|
| 41 |
+
raise ValueError(
|
| 42 |
+
f"{self.__class__} can only be run AFTER scheduler has been run "
|
| 43 |
+
"in 'prk' mode for at least 12 iterations "
|
| 44 |
+
"See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py "
|
| 45 |
+
"for more information."
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
prev_timestep = (
|
| 49 |
+
timestep - self.config.num_train_timesteps // self.num_inference_steps
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
if self.counter != 1:
|
| 53 |
+
self.ets = self.ets[-3:]
|
| 54 |
+
self.ets.append(model_output)
|
| 55 |
+
else:
|
| 56 |
+
prev_timestep = timestep
|
| 57 |
+
timestep = (
|
| 58 |
+
timestep + self.config.num_train_timesteps // self.num_inference_steps
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
if len(self.ets) == 1 and self.counter == 0:
|
| 62 |
+
model_output = model_output
|
| 63 |
+
self.cur_sample = sample
|
| 64 |
+
elif len(self.ets) == 1 and self.counter == 1:
|
| 65 |
+
model_output = (model_output + self.ets[-1]) / 2
|
| 66 |
+
sample = self.cur_sample
|
| 67 |
+
# self.cur_sample = None
|
| 68 |
+
elif len(self.ets) == 2:
|
| 69 |
+
model_output = (3 * self.ets[-1] - self.ets[-2]) / 2
|
| 70 |
+
elif len(self.ets) == 3:
|
| 71 |
+
model_output = (
|
| 72 |
+
23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]
|
| 73 |
+
) / 12
|
| 74 |
+
else:
|
| 75 |
+
model_output = (1 / 24) * (
|
| 76 |
+
55 * self.ets[-1]
|
| 77 |
+
- 59 * self.ets[-2]
|
| 78 |
+
+ 37 * self.ets[-3]
|
| 79 |
+
- 9 * self.ets[-4]
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
prev_sample = self._get_prev_sample(
|
| 83 |
+
sample, timestep, prev_timestep, model_output
|
| 84 |
+
)
|
| 85 |
+
self.counter += 1
|
| 86 |
+
|
| 87 |
+
if not return_dict:
|
| 88 |
+
return (prev_sample,)
|
| 89 |
+
|
| 90 |
+
return SchedulerOutput(prev_sample=prev_sample)
|
| 91 |
+
|
| 92 |
+
def step_back(
|
| 93 |
+
self,
|
| 94 |
+
current_samples: torch.FloatTensor,
|
| 95 |
+
noise: torch.FloatTensor,
|
| 96 |
+
current_timesteps: torch.IntTensor,
|
| 97 |
+
target_timesteps: torch.IntTensor,
|
| 98 |
+
):
|
| 99 |
+
"""Custom function for stepping back in the diffusion process."""
|
| 100 |
+
|
| 101 |
+
assert current_timesteps <= target_timesteps
|
| 102 |
+
alphas_cumprod = self.alphas_cumprod.to(
|
| 103 |
+
device=current_samples.device, dtype=current_samples.dtype
|
| 104 |
+
)
|
| 105 |
+
target_timesteps = target_timesteps.to(current_samples.device)
|
| 106 |
+
current_timesteps = current_timesteps.to(current_samples.device)
|
| 107 |
+
alpha_prod_target = alphas_cumprod[target_timesteps]
|
| 108 |
+
alpha_prod_target = alpha_prod_target.flatten()
|
| 109 |
+
alpha_prod_current = alphas_cumprod[current_timesteps]
|
| 110 |
+
alpha_prod_current = alpha_prod_current.flatten()
|
| 111 |
+
alpha_prod = alpha_prod_target / alpha_prod_current
|
| 112 |
+
|
| 113 |
+
sqrt_alpha_prod = alpha_prod**0.5
|
| 114 |
+
sqrt_one_minus_alpha_prod = (1 - alpha_prod) ** 0.5
|
| 115 |
+
|
| 116 |
+
while len(sqrt_alpha_prod.shape) < len(current_samples.shape):
|
| 117 |
+
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
|
| 118 |
+
while len(sqrt_one_minus_alpha_prod.shape) < len(current_samples.shape):
|
| 119 |
+
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
|
| 120 |
+
|
| 121 |
+
noisy_samples = (
|
| 122 |
+
sqrt_alpha_prod * current_samples + sqrt_one_minus_alpha_prod * noise
|
| 123 |
+
)
|
| 124 |
+
self.counter -= 1
|
| 125 |
+
|
| 126 |
+
return noisy_samples
|
utils/__pycache__/dataset.cpython-39.pyc
ADDED
|
Binary file (28.2 kB). View file
|
|
|
utils/__pycache__/utils.cpython-39.pyc
ADDED
|
Binary file (8.55 kB). View file
|
|
|
utils/dataset.py
ADDED
|
@@ -0,0 +1,1304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import glob
|
| 4 |
+
import random
|
| 5 |
+
import timeit
|
| 6 |
+
import numpy as np
|
| 7 |
+
import skimage
|
| 8 |
+
import yaml
|
| 9 |
+
import torch
|
| 10 |
+
import torchvision.transforms as transforms
|
| 11 |
+
import torchvision.transforms.functional as TF
|
| 12 |
+
from PIL import Image
|
| 13 |
+
from torch.utils.data import Dataset
|
| 14 |
+
from torch.distributions import Normal
|
| 15 |
+
|
| 16 |
+
# from utils.utils import RGB2YCbCr
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class RandomGammaCorrection(object):
|
| 20 |
+
def __init__(self, gamma=None):
|
| 21 |
+
self.gamma = gamma
|
| 22 |
+
|
| 23 |
+
def __call__(self, image):
|
| 24 |
+
if self.gamma == None:
|
| 25 |
+
# more chances of selecting 0 (original image)
|
| 26 |
+
gammas = [0.5, 1, 2]
|
| 27 |
+
self.gamma = random.choice(gammas)
|
| 28 |
+
return TF.adjust_gamma(image, self.gamma, gain=1)
|
| 29 |
+
elif isinstance(self.gamma, tuple):
|
| 30 |
+
gamma = random.uniform(*self.gamma)
|
| 31 |
+
return TF.adjust_gamma(image, gamma, gain=1)
|
| 32 |
+
elif self.gamma == 0:
|
| 33 |
+
return image
|
| 34 |
+
else:
|
| 35 |
+
return TF.adjust_gamma(image, self.gamma, gain=1)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def remove_background(image):
|
| 39 |
+
# the input of the image is PIL.Image form with [H,W,C]
|
| 40 |
+
image = np.float32(np.array(image))
|
| 41 |
+
_EPS = 1e-7
|
| 42 |
+
rgb_max = np.max(image, (0, 1))
|
| 43 |
+
rgb_min = np.min(image, (0, 1))
|
| 44 |
+
image = (image - rgb_min) * rgb_max / (rgb_max - rgb_min + _EPS)
|
| 45 |
+
image = torch.from_numpy(image)
|
| 46 |
+
return image
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def glod_from_folder(folder_list, index_list):
|
| 50 |
+
ext = ["png", "jpeg", "jpg", "bmp", "tif"]
|
| 51 |
+
index_dict = {}
|
| 52 |
+
for i, folder_name in enumerate(folder_list):
|
| 53 |
+
data_list = []
|
| 54 |
+
[data_list.extend(glob.glob(folder_name + "/*." + e)) for e in ext]
|
| 55 |
+
data_list.sort()
|
| 56 |
+
index_dict[index_list[i]] = data_list
|
| 57 |
+
return index_dict
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class Flare_Image_Loader(Dataset):
|
| 61 |
+
def __init__(self, image_path, transform_base, transform_flare, mask_type=None):
|
| 62 |
+
self.ext = ["png", "jpeg", "jpg", "bmp", "tif"]
|
| 63 |
+
self.data_list = []
|
| 64 |
+
[self.data_list.extend(glob.glob(image_path + "/*." + e)) for e in self.ext]
|
| 65 |
+
self.flare_dict = {}
|
| 66 |
+
self.flare_list = []
|
| 67 |
+
self.flare_name_list = []
|
| 68 |
+
|
| 69 |
+
self.reflective_flag = False
|
| 70 |
+
self.reflective_dict = {}
|
| 71 |
+
self.reflective_list = []
|
| 72 |
+
self.reflective_name_list = []
|
| 73 |
+
|
| 74 |
+
self.light_flag = False
|
| 75 |
+
self.light_dict = {}
|
| 76 |
+
self.light_list = []
|
| 77 |
+
self.light_name_list = []
|
| 78 |
+
|
| 79 |
+
self.mask_type = (
|
| 80 |
+
mask_type # It is a str which may be None,"luminance" or "color"
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
self.img_size = transform_base["img_size"]
|
| 84 |
+
|
| 85 |
+
self.transform_base = transforms.Compose(
|
| 86 |
+
[
|
| 87 |
+
transforms.RandomCrop(
|
| 88 |
+
(self.img_size, self.img_size),
|
| 89 |
+
pad_if_needed=True,
|
| 90 |
+
padding_mode="reflect",
|
| 91 |
+
),
|
| 92 |
+
transforms.RandomHorizontalFlip(),
|
| 93 |
+
# transforms.RandomVerticalFlip(),
|
| 94 |
+
]
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
self.transform_flare = transforms.Compose(
|
| 98 |
+
[
|
| 99 |
+
transforms.RandomAffine(
|
| 100 |
+
degrees=(0, 360),
|
| 101 |
+
scale=(transform_flare["scale_min"], transform_flare["scale_max"]),
|
| 102 |
+
translate=(
|
| 103 |
+
transform_flare["translate"] / 1440,
|
| 104 |
+
transform_flare["translate"] / 1440,
|
| 105 |
+
),
|
| 106 |
+
shear=(-transform_flare["shear"], transform_flare["shear"]),
|
| 107 |
+
),
|
| 108 |
+
transforms.CenterCrop((self.img_size, self.img_size)),
|
| 109 |
+
transforms.RandomHorizontalFlip(),
|
| 110 |
+
transforms.RandomVerticalFlip(),
|
| 111 |
+
]
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
self.normalize = transforms.Compose(
|
| 115 |
+
[
|
| 116 |
+
transforms.Normalize([0.5], [0.5]),
|
| 117 |
+
]
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
self.data_ratio = []
|
| 121 |
+
|
| 122 |
+
def lightsource_crop(self, matrix):
|
| 123 |
+
"""Find the largest rectangle of 1s in a binary matrix."""
|
| 124 |
+
|
| 125 |
+
def largestRectangleArea(heights):
|
| 126 |
+
heights.append(0)
|
| 127 |
+
stack = [-1]
|
| 128 |
+
max_area = 0
|
| 129 |
+
max_rectangle = (0, 0, 0, 0) # (area, left, right, height)
|
| 130 |
+
|
| 131 |
+
for i in range(len(heights)):
|
| 132 |
+
while heights[i] < heights[stack[-1]]:
|
| 133 |
+
h = heights[stack.pop()]
|
| 134 |
+
w = i - stack[-1] - 1
|
| 135 |
+
area = h * w
|
| 136 |
+
if area > max_area:
|
| 137 |
+
max_area = area
|
| 138 |
+
max_rectangle = (area, stack[-1] + 1, i - 1, h)
|
| 139 |
+
stack.append(i)
|
| 140 |
+
|
| 141 |
+
heights.pop()
|
| 142 |
+
return max_rectangle
|
| 143 |
+
|
| 144 |
+
max_area = 0
|
| 145 |
+
max_rectangle = [0, 0, 0, 0] # (left, right, top, bottom)
|
| 146 |
+
heights = torch.zeros(matrix.shape[1])
|
| 147 |
+
|
| 148 |
+
for row in range(matrix.shape[0]):
|
| 149 |
+
temp = 1 - matrix[row]
|
| 150 |
+
heights = (heights + temp) * temp
|
| 151 |
+
|
| 152 |
+
area, left, right, height = largestRectangleArea(heights.tolist())
|
| 153 |
+
if area > max_area:
|
| 154 |
+
max_area = area
|
| 155 |
+
max_rectangle = [int(left), int(right), int(row - height + 1), int(row)]
|
| 156 |
+
|
| 157 |
+
return torch.tensor(max_rectangle)
|
| 158 |
+
|
| 159 |
+
def __getitem__(self, index):
|
| 160 |
+
# load base image
|
| 161 |
+
img_path = self.data_list[index]
|
| 162 |
+
base_img = Image.open(img_path).convert("RGB")
|
| 163 |
+
|
| 164 |
+
gamma = np.random.uniform(1.8, 2.2)
|
| 165 |
+
to_tensor = transforms.ToTensor()
|
| 166 |
+
adjust_gamma = RandomGammaCorrection(gamma)
|
| 167 |
+
adjust_gamma_reverse = RandomGammaCorrection(1 / gamma)
|
| 168 |
+
color_jitter = transforms.ColorJitter(brightness=(0.8, 3), hue=0.0)
|
| 169 |
+
if self.transform_base is not None:
|
| 170 |
+
base_img = to_tensor(base_img)
|
| 171 |
+
base_img = adjust_gamma(base_img)
|
| 172 |
+
base_img = self.transform_base(base_img)
|
| 173 |
+
else:
|
| 174 |
+
base_img = to_tensor(base_img)
|
| 175 |
+
base_img = adjust_gamma(base_img)
|
| 176 |
+
sigma_chi = 0.01 * np.random.chisquare(df=1)
|
| 177 |
+
base_img = Normal(base_img, sigma_chi).sample()
|
| 178 |
+
gain = np.random.uniform(0.5, 1.2)
|
| 179 |
+
flare_DC_offset = np.random.uniform(-0.02, 0.02)
|
| 180 |
+
base_img = gain * base_img
|
| 181 |
+
base_img = torch.clamp(base_img, min=0, max=1)
|
| 182 |
+
|
| 183 |
+
choice_dataset = random.choices(
|
| 184 |
+
[i for i in range(len(self.flare_list))], self.data_ratio
|
| 185 |
+
)[0]
|
| 186 |
+
choice_index = random.randint(0, len(self.flare_list[choice_dataset]) - 1)
|
| 187 |
+
|
| 188 |
+
# load flare and light source image
|
| 189 |
+
if self.light_flag:
|
| 190 |
+
assert len(self.flare_list) == len(
|
| 191 |
+
self.light_list
|
| 192 |
+
), "Error, number of light source and flares dataset no match!"
|
| 193 |
+
for i in range(len(self.flare_list)):
|
| 194 |
+
assert len(self.flare_list[i]) == len(
|
| 195 |
+
self.light_list[i]
|
| 196 |
+
), f"Error, number of light source and flares no match in {i} dataset!"
|
| 197 |
+
flare_path = self.flare_list[choice_dataset][choice_index]
|
| 198 |
+
light_path = self.light_list[choice_dataset][choice_index]
|
| 199 |
+
light_img = Image.open(light_path).convert("RGB")
|
| 200 |
+
light_img = to_tensor(light_img)
|
| 201 |
+
light_img = adjust_gamma(light_img)
|
| 202 |
+
else:
|
| 203 |
+
flare_path = self.flare_list[choice_dataset][choice_index]
|
| 204 |
+
flare_img = Image.open(flare_path).convert("RGB")
|
| 205 |
+
if self.reflective_flag:
|
| 206 |
+
reflective_path_list = self.reflective_list[choice_dataset]
|
| 207 |
+
if len(reflective_path_list) != 0:
|
| 208 |
+
reflective_path = random.choice(reflective_path_list)
|
| 209 |
+
reflective_img = Image.open(reflective_path).convert("RGB")
|
| 210 |
+
else:
|
| 211 |
+
reflective_img = None
|
| 212 |
+
|
| 213 |
+
flare_img = to_tensor(flare_img)
|
| 214 |
+
flare_img = adjust_gamma(flare_img)
|
| 215 |
+
|
| 216 |
+
if self.reflective_flag and reflective_img is not None:
|
| 217 |
+
reflective_img = to_tensor(reflective_img)
|
| 218 |
+
reflective_img = adjust_gamma(reflective_img)
|
| 219 |
+
flare_img = torch.clamp(flare_img + reflective_img, min=0, max=1)
|
| 220 |
+
|
| 221 |
+
flare_img = remove_background(flare_img)
|
| 222 |
+
|
| 223 |
+
if self.transform_flare is not None:
|
| 224 |
+
if self.light_flag:
|
| 225 |
+
flare_merge = torch.cat((flare_img, light_img), dim=0)
|
| 226 |
+
flare_merge = self.transform_flare(flare_merge)
|
| 227 |
+
else:
|
| 228 |
+
flare_img = self.transform_flare(flare_img)
|
| 229 |
+
|
| 230 |
+
# change color
|
| 231 |
+
if self.light_flag:
|
| 232 |
+
# flare_merge=color_jitter(flare_merge)
|
| 233 |
+
flare_img, light_img = torch.split(flare_merge, 3, dim=0)
|
| 234 |
+
else:
|
| 235 |
+
flare_img = color_jitter(flare_img)
|
| 236 |
+
|
| 237 |
+
# flare blur
|
| 238 |
+
blur_transform = transforms.GaussianBlur(21, sigma=(0.1, 3.0))
|
| 239 |
+
flare_img = blur_transform(flare_img)
|
| 240 |
+
# flare_img=flare_img+flare_DC_offset
|
| 241 |
+
flare_img = torch.clamp(flare_img, min=0, max=1)
|
| 242 |
+
|
| 243 |
+
# merge image
|
| 244 |
+
merge_img = flare_img + base_img
|
| 245 |
+
merge_img = torch.clamp(merge_img, min=0, max=1)
|
| 246 |
+
if self.light_flag:
|
| 247 |
+
base_img = base_img + light_img
|
| 248 |
+
base_img = torch.clamp(base_img, min=0, max=1)
|
| 249 |
+
flare_img = flare_img - light_img
|
| 250 |
+
flare_img = torch.clamp(flare_img, min=0, max=1)
|
| 251 |
+
|
| 252 |
+
flare_mask = None
|
| 253 |
+
if self.mask_type == None:
|
| 254 |
+
return {
|
| 255 |
+
"gt": adjust_gamma_reverse(base_img),
|
| 256 |
+
"flare": adjust_gamma_reverse(flare_img),
|
| 257 |
+
"lq": adjust_gamma_reverse(merge_img),
|
| 258 |
+
"gamma": gamma,
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
elif self.mask_type == "luminance":
|
| 262 |
+
# calculate mask (the mask is 3 channel)
|
| 263 |
+
one = torch.ones_like(base_img)
|
| 264 |
+
zero = torch.zeros_like(base_img)
|
| 265 |
+
|
| 266 |
+
luminance = 0.3 * flare_img[0] + 0.59 * flare_img[1] + 0.11 * flare_img[2]
|
| 267 |
+
threshold_value = 0.99**gamma
|
| 268 |
+
flare_mask = torch.where(luminance > threshold_value, one, zero)
|
| 269 |
+
|
| 270 |
+
elif self.mask_type == "color":
|
| 271 |
+
one = torch.ones_like(base_img)
|
| 272 |
+
zero = torch.zeros_like(base_img)
|
| 273 |
+
|
| 274 |
+
threshold_value = 0.99**gamma
|
| 275 |
+
flare_mask = torch.where(merge_img > threshold_value, one, zero)
|
| 276 |
+
|
| 277 |
+
elif self.mask_type == "flare":
|
| 278 |
+
one = torch.ones_like(base_img)
|
| 279 |
+
zero = torch.zeros_like(base_img)
|
| 280 |
+
|
| 281 |
+
threshold_value = 0.7**gamma
|
| 282 |
+
flare_mask = torch.where(flare_img > threshold_value, one, zero)
|
| 283 |
+
|
| 284 |
+
elif self.mask_type == "light":
|
| 285 |
+
# Depreciated: we dont need light mask anymore
|
| 286 |
+
one = torch.ones_like(base_img)
|
| 287 |
+
zero = torch.zeros_like(base_img)
|
| 288 |
+
|
| 289 |
+
luminance = 0.3 * light_img[0] + 0.59 * light_img[1] + 0.11 * light_img[2]
|
| 290 |
+
threshold_value = 0.01
|
| 291 |
+
flare_mask = torch.where(luminance > threshold_value, one, zero)
|
| 292 |
+
|
| 293 |
+
light_source_cond = torch.zeros_like(flare_mask[0])
|
| 294 |
+
light_source_cond = (flare_mask[0] + flare_mask[1] + flare_mask[2]) > 0
|
| 295 |
+
light_source_cond = light_source_cond.float()
|
| 296 |
+
light_source_cond = torch.repeat_interleave(
|
| 297 |
+
light_source_cond[None, ...], 3, dim=0
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
# box = self.crop(light_source_cond[0])
|
| 301 |
+
box = self.lightsource_crop(light_source_cond[0])
|
| 302 |
+
|
| 303 |
+
# random int between -15 ~ 15
|
| 304 |
+
margin = random.randint(-15, 15)
|
| 305 |
+
|
| 306 |
+
if box[0] - margin >= 0:
|
| 307 |
+
box[0] -= margin
|
| 308 |
+
if box[1] + margin < self.img_size:
|
| 309 |
+
box[1] += margin
|
| 310 |
+
if box[2] - margin >= 0:
|
| 311 |
+
box[2] -= margin
|
| 312 |
+
if box[3] + margin < self.img_size:
|
| 313 |
+
box[3] += margin
|
| 314 |
+
|
| 315 |
+
top, bottom, left, right = box[2], box[3], box[0], box[1]
|
| 316 |
+
|
| 317 |
+
merge_img = adjust_gamma_reverse(merge_img)
|
| 318 |
+
|
| 319 |
+
cropped_mask = torch.ones((self.img_size, self.img_size))
|
| 320 |
+
cropped_mask[top : bottom + 1, left : right + 1] = False
|
| 321 |
+
cropped_mask = torch.repeat_interleave(cropped_mask[None, ...], 1, dim=0)
|
| 322 |
+
|
| 323 |
+
channel3_mask = cropped_mask.repeat(3, 1, 1)
|
| 324 |
+
masked_img = merge_img * (1 - channel3_mask)
|
| 325 |
+
masked_img[channel3_mask == 1] = 0.5
|
| 326 |
+
|
| 327 |
+
return {
|
| 328 |
+
# add
|
| 329 |
+
"pixel_values": self.normalize(merge_img),
|
| 330 |
+
"masks": cropped_mask,
|
| 331 |
+
"masked_images": self.normalize(masked_img),
|
| 332 |
+
"conditioning_pixel_values": light_source_cond,
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
def __len__(self):
|
| 336 |
+
return len(self.data_list)
|
| 337 |
+
|
| 338 |
+
def load_scattering_flare(self, flare_name, flare_path):
|
| 339 |
+
flare_list = []
|
| 340 |
+
[flare_list.extend(glob.glob(flare_path + "/*." + e)) for e in self.ext]
|
| 341 |
+
flare_list = sorted(flare_list)
|
| 342 |
+
self.flare_name_list.append(flare_name)
|
| 343 |
+
self.flare_dict[flare_name] = flare_list
|
| 344 |
+
self.flare_list.append(flare_list)
|
| 345 |
+
len_flare_list = len(self.flare_dict[flare_name])
|
| 346 |
+
if len_flare_list == 0:
|
| 347 |
+
print("ERROR: scattering flare images are not loaded properly")
|
| 348 |
+
else:
|
| 349 |
+
print(
|
| 350 |
+
"Scattering Flare Image:",
|
| 351 |
+
flare_name,
|
| 352 |
+
" is loaded successfully with examples",
|
| 353 |
+
str(len_flare_list),
|
| 354 |
+
)
|
| 355 |
+
# print("Now we have", len(self.flare_list), "scattering flare images")
|
| 356 |
+
|
| 357 |
+
def load_light_source(self, light_name, light_path):
|
| 358 |
+
# The number of the light source images should match the number of scattering flares
|
| 359 |
+
light_list = []
|
| 360 |
+
[light_list.extend(glob.glob(light_path + "/*." + e)) for e in self.ext]
|
| 361 |
+
light_list = sorted(light_list)
|
| 362 |
+
self.flare_name_list.append(light_name)
|
| 363 |
+
self.light_dict[light_name] = light_list
|
| 364 |
+
self.light_list.append(light_list)
|
| 365 |
+
len_light_list = len(self.light_dict[light_name])
|
| 366 |
+
|
| 367 |
+
if len_light_list == 0:
|
| 368 |
+
print("ERROR: Light Source images are not loaded properly")
|
| 369 |
+
else:
|
| 370 |
+
self.light_flag = True
|
| 371 |
+
print(
|
| 372 |
+
"Light Source Image:",
|
| 373 |
+
light_name,
|
| 374 |
+
" is loaded successfully with examples",
|
| 375 |
+
str(len_light_list),
|
| 376 |
+
)
|
| 377 |
+
# print("Now we have", len(self.light_list), "light source images")
|
| 378 |
+
|
| 379 |
+
def load_reflective_flare(self, reflective_name, reflective_path):
|
| 380 |
+
if reflective_path is None:
|
| 381 |
+
reflective_list = []
|
| 382 |
+
else:
|
| 383 |
+
reflective_list = []
|
| 384 |
+
[
|
| 385 |
+
reflective_list.extend(glob.glob(reflective_path + "/*." + e))
|
| 386 |
+
for e in self.ext
|
| 387 |
+
]
|
| 388 |
+
reflective_list = sorted(reflective_list)
|
| 389 |
+
self.reflective_name_list.append(reflective_name)
|
| 390 |
+
self.reflective_dict[reflective_name] = reflective_list
|
| 391 |
+
self.reflective_list.append(reflective_list)
|
| 392 |
+
len_reflective_list = len(self.reflective_dict[reflective_name])
|
| 393 |
+
if len_reflective_list == 0 and reflective_path is not None:
|
| 394 |
+
print("ERROR: reflective flare images are not loaded properly")
|
| 395 |
+
else:
|
| 396 |
+
self.reflective_flag = True
|
| 397 |
+
print(
|
| 398 |
+
"Reflective Flare Image:",
|
| 399 |
+
reflective_name,
|
| 400 |
+
" is loaded successfully with examples",
|
| 401 |
+
str(len_reflective_list),
|
| 402 |
+
)
|
| 403 |
+
# print("Now we have", len(self.reflective_list), "refelctive flare images")
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
class Flare7kpp_Pair_Loader(Flare_Image_Loader):
|
| 407 |
+
def __init__(self, config):
|
| 408 |
+
Flare_Image_Loader.__init__(
|
| 409 |
+
self,
|
| 410 |
+
config["image_path"],
|
| 411 |
+
config["transform_base"],
|
| 412 |
+
config["transform_flare"],
|
| 413 |
+
config["mask_type"],
|
| 414 |
+
)
|
| 415 |
+
scattering_dict = config["scattering_dict"]
|
| 416 |
+
reflective_dict = config["reflective_dict"]
|
| 417 |
+
light_dict = config["light_dict"]
|
| 418 |
+
|
| 419 |
+
# defualt not use light mask if opt['use_light_mask'] is not declared
|
| 420 |
+
if "data_ratio" not in config or len(config["data_ratio"]) == 0:
|
| 421 |
+
self.data_ratio = [1] * len(scattering_dict)
|
| 422 |
+
else:
|
| 423 |
+
self.data_ratio = config["data_ratio"]
|
| 424 |
+
|
| 425 |
+
if len(scattering_dict) != 0:
|
| 426 |
+
for key in scattering_dict.keys():
|
| 427 |
+
self.load_scattering_flare(key, scattering_dict[key])
|
| 428 |
+
if len(reflective_dict) != 0:
|
| 429 |
+
for key in reflective_dict.keys():
|
| 430 |
+
self.load_reflective_flare(key, reflective_dict[key])
|
| 431 |
+
if len(light_dict) != 0:
|
| 432 |
+
for key in light_dict.keys():
|
| 433 |
+
self.load_light_source(key, light_dict[key])
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
class Lightsource_Regress_Loader(Flare7kpp_Pair_Loader):
|
| 437 |
+
def __init__(self, config, num_lights=4):
|
| 438 |
+
Flare7kpp_Pair_Loader.__init__(self, config)
|
| 439 |
+
self.transform_flare = transforms.Compose(
|
| 440 |
+
[
|
| 441 |
+
transforms.RandomAffine(
|
| 442 |
+
degrees=(0, 360),
|
| 443 |
+
scale=(
|
| 444 |
+
config["transform_flare"]["scale_min"],
|
| 445 |
+
config["transform_flare"]["scale_max"],
|
| 446 |
+
),
|
| 447 |
+
shear=(
|
| 448 |
+
-config["transform_flare"]["shear"],
|
| 449 |
+
config["transform_flare"]["shear"],
|
| 450 |
+
),
|
| 451 |
+
),
|
| 452 |
+
# transforms.CenterCrop((self.img_size, self.img_size)),
|
| 453 |
+
]
|
| 454 |
+
)
|
| 455 |
+
|
| 456 |
+
self.mask_type = "light"
|
| 457 |
+
self.num_lights = num_lights
|
| 458 |
+
|
| 459 |
+
def __getitem__(self, index):
|
| 460 |
+
# load base image
|
| 461 |
+
img_path = self.data_list[index]
|
| 462 |
+
base_img = Image.open(img_path).convert("RGB")
|
| 463 |
+
|
| 464 |
+
gamma = np.random.uniform(1.8, 2.2)
|
| 465 |
+
to_tensor = transforms.ToTensor()
|
| 466 |
+
adjust_gamma = RandomGammaCorrection(gamma)
|
| 467 |
+
adjust_gamma_reverse = RandomGammaCorrection(1 / gamma)
|
| 468 |
+
color_jitter = transforms.ColorJitter(brightness=(0.8, 3), hue=0.0)
|
| 469 |
+
|
| 470 |
+
base_img = to_tensor(base_img)
|
| 471 |
+
base_img = adjust_gamma(base_img)
|
| 472 |
+
if self.transform_base is not None:
|
| 473 |
+
base_img = self.transform_base(base_img)
|
| 474 |
+
|
| 475 |
+
sigma_chi = 0.01 * np.random.chisquare(df=1)
|
| 476 |
+
base_img = Normal(base_img, sigma_chi).sample()
|
| 477 |
+
gain = np.random.uniform(0.5, 1.2)
|
| 478 |
+
base_img = gain * base_img
|
| 479 |
+
base_img = torch.clamp(base_img, min=0, max=1)
|
| 480 |
+
|
| 481 |
+
# init flare and light imgs
|
| 482 |
+
flare_imgs = []
|
| 483 |
+
light_imgs = []
|
| 484 |
+
position = [
|
| 485 |
+
[[-224, 0], [-224, 0]],
|
| 486 |
+
[[-224, 0], [0, 224]],
|
| 487 |
+
[[0, 224], [-224, 0]],
|
| 488 |
+
[[0, 224], [0, 224]],
|
| 489 |
+
]
|
| 490 |
+
axis = random.sample(range(4), 4)
|
| 491 |
+
axis[-1] = axis[0]
|
| 492 |
+
flare_nums = int(
|
| 493 |
+
random.random() * self.num_lights + 1
|
| 494 |
+
) # random number of flares from 1 to 4
|
| 495 |
+
|
| 496 |
+
for fn in range(flare_nums):
|
| 497 |
+
choice_dataset = random.choices(
|
| 498 |
+
[i for i in range(len(self.flare_list))], self.data_ratio
|
| 499 |
+
)[0]
|
| 500 |
+
choice_index = random.randint(0, len(self.flare_list[choice_dataset]) - 1)
|
| 501 |
+
|
| 502 |
+
flare_path = self.flare_list[choice_dataset][choice_index]
|
| 503 |
+
flare_img = Image.open(flare_path).convert("RGB")
|
| 504 |
+
flare_img = to_tensor(flare_img)
|
| 505 |
+
flare_img = adjust_gamma(flare_img)
|
| 506 |
+
flare_img = remove_background(flare_img)
|
| 507 |
+
|
| 508 |
+
if self.light_flag:
|
| 509 |
+
light_path = self.light_list[choice_dataset][choice_index]
|
| 510 |
+
light_img = Image.open(light_path).convert("RGB")
|
| 511 |
+
light_img = to_tensor(light_img)
|
| 512 |
+
light_img = adjust_gamma(light_img)
|
| 513 |
+
|
| 514 |
+
if self.transform_flare is not None:
|
| 515 |
+
if self.light_flag:
|
| 516 |
+
flare_merge = torch.cat((flare_img, light_img), dim=0)
|
| 517 |
+
|
| 518 |
+
if flare_nums == 1:
|
| 519 |
+
dx = random.randint(-224, 224)
|
| 520 |
+
dy = random.randint(-224, 224)
|
| 521 |
+
else:
|
| 522 |
+
dx = random.randint(
|
| 523 |
+
position[axis[fn]][0][0], position[axis[fn]][0][1]
|
| 524 |
+
)
|
| 525 |
+
dy = random.randint(
|
| 526 |
+
position[axis[fn]][1][0], position[axis[fn]][1][1]
|
| 527 |
+
)
|
| 528 |
+
if -160 < dx < 160 and -160 < dy < 160:
|
| 529 |
+
if random.random() < 0.5:
|
| 530 |
+
dx = 160 if dx > 0 else -160
|
| 531 |
+
else:
|
| 532 |
+
dy = 160 if dy > 0 else -160
|
| 533 |
+
|
| 534 |
+
flare_merge = self.transform_flare(flare_merge)
|
| 535 |
+
flare_merge = TF.affine(
|
| 536 |
+
flare_merge, angle=0, translate=(dx, dy), scale=1.0, shear=0
|
| 537 |
+
)
|
| 538 |
+
flare_merge = TF.center_crop(
|
| 539 |
+
flare_merge, (self.img_size, self.img_size)
|
| 540 |
+
)
|
| 541 |
+
else:
|
| 542 |
+
flare_img = self.transform_flare(flare_img)
|
| 543 |
+
|
| 544 |
+
# change color
|
| 545 |
+
if self.light_flag:
|
| 546 |
+
flare_img, light_img = torch.split(flare_merge, 3, dim=0)
|
| 547 |
+
else:
|
| 548 |
+
flare_img = color_jitter(flare_img)
|
| 549 |
+
|
| 550 |
+
flare_imgs.append(flare_img)
|
| 551 |
+
if self.light_flag:
|
| 552 |
+
light_img = torch.clamp(light_img, min=0, max=1)
|
| 553 |
+
light_imgs.append(light_img)
|
| 554 |
+
|
| 555 |
+
flare_img = torch.sum(torch.stack(flare_imgs), dim=0)
|
| 556 |
+
flare_img = torch.clamp(flare_img, min=0, max=1)
|
| 557 |
+
|
| 558 |
+
# flare blur
|
| 559 |
+
blur_transform = transforms.GaussianBlur(21, sigma=(0.1, 3.0))
|
| 560 |
+
flare_img = blur_transform(flare_img)
|
| 561 |
+
flare_img = torch.clamp(flare_img, min=0, max=1)
|
| 562 |
+
|
| 563 |
+
merge_img = torch.clamp(flare_img + base_img, min=0, max=1)
|
| 564 |
+
|
| 565 |
+
if self.light_flag:
|
| 566 |
+
light_img = torch.sum(torch.stack(light_imgs), dim=0)
|
| 567 |
+
light_img = torch.clamp(light_img, min=0, max=1)
|
| 568 |
+
base_img = torch.clamp(base_img + light_img, min=0, max=1)
|
| 569 |
+
flare_img = torch.clamp(flare_img - light_img, min=0, max=1)
|
| 570 |
+
|
| 571 |
+
flare_mask = None
|
| 572 |
+
if self.mask_type == None:
|
| 573 |
+
return {
|
| 574 |
+
"gt": adjust_gamma_reverse(base_img),
|
| 575 |
+
"flare": adjust_gamma_reverse(flare_img),
|
| 576 |
+
"lq": adjust_gamma_reverse(merge_img),
|
| 577 |
+
"gamma": gamma,
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
elif self.mask_type == "light":
|
| 581 |
+
one = torch.ones_like(base_img)
|
| 582 |
+
zero = torch.zeros_like(base_img)
|
| 583 |
+
threshold_value = 0.01
|
| 584 |
+
|
| 585 |
+
# flare_masks_list = []
|
| 586 |
+
XYRs = torch.zeros((self.num_lights, 4))
|
| 587 |
+
for i in range(flare_nums):
|
| 588 |
+
luminance = (
|
| 589 |
+
0.3 * light_imgs[i][0]
|
| 590 |
+
+ 0.59 * light_imgs[i][1]
|
| 591 |
+
+ 0.11 * light_imgs[i][2]
|
| 592 |
+
)
|
| 593 |
+
flare_mask = torch.where(luminance > threshold_value, one, zero)
|
| 594 |
+
|
| 595 |
+
light_source_cond = (flare_mask.sum(dim=0) > 0).float()
|
| 596 |
+
|
| 597 |
+
x, y, r = self.find_circle_properties(light_source_cond, i)
|
| 598 |
+
XYRs[i] = torch.tensor([x, y, r, 1.0])
|
| 599 |
+
|
| 600 |
+
XYRs[:, :3] = XYRs[:, :3] / self.img_size
|
| 601 |
+
|
| 602 |
+
luminance = 0.3 * light_img[0] + 0.59 * light_img[1] + 0.11 * light_img[2]
|
| 603 |
+
flare_mask = torch.where(luminance > threshold_value, one, zero)
|
| 604 |
+
|
| 605 |
+
light_source_cond = (flare_mask.sum(dim=0) > 0).float()
|
| 606 |
+
|
| 607 |
+
light_source_cond = torch.repeat_interleave(
|
| 608 |
+
light_source_cond[None, ...], 1, dim=0
|
| 609 |
+
)
|
| 610 |
+
|
| 611 |
+
# box = self.crop(light_source_cond[0])
|
| 612 |
+
box = self.lightsource_crop(light_source_cond[0])
|
| 613 |
+
|
| 614 |
+
# random int between 0 ~ 15
|
| 615 |
+
margin = random.randint(0, 15)
|
| 616 |
+
if box[0] - margin >= 0:
|
| 617 |
+
box[0] -= margin
|
| 618 |
+
if box[1] + margin < self.img_size:
|
| 619 |
+
box[1] += margin
|
| 620 |
+
if box[2] - margin >= 0:
|
| 621 |
+
box[2] -= margin
|
| 622 |
+
if box[3] + margin < self.img_size:
|
| 623 |
+
box[3] += margin
|
| 624 |
+
|
| 625 |
+
top, bottom, left, right = box[2], box[3], box[0], box[1]
|
| 626 |
+
|
| 627 |
+
merge_img = adjust_gamma_reverse(merge_img)
|
| 628 |
+
|
| 629 |
+
cropped_mask = torch.full(
|
| 630 |
+
(self.img_size, self.img_size), True, dtype=torch.bool
|
| 631 |
+
)
|
| 632 |
+
cropped_mask[top : bottom + 1, left : right + 1] = False
|
| 633 |
+
channel3_mask = cropped_mask.unsqueeze(0).expand(3, -1, -1)
|
| 634 |
+
|
| 635 |
+
masked_img = merge_img * (1 - channel3_mask.float())
|
| 636 |
+
masked_img[channel3_mask] = 0.5
|
| 637 |
+
|
| 638 |
+
return {
|
| 639 |
+
# add
|
| 640 |
+
"input": self.normalize(masked_img), # normalize to [-1, 1]
|
| 641 |
+
"light_masks": light_source_cond,
|
| 642 |
+
"xyrs": XYRs,
|
| 643 |
+
}
|
| 644 |
+
|
| 645 |
+
def find_circle_properties(self, mask, i, method="minEnclosingCircle"):
|
| 646 |
+
"""
|
| 647 |
+
Find the properties of the light source circle in the mask.
|
| 648 |
+
"""
|
| 649 |
+
|
| 650 |
+
_mask = (mask.numpy() * 255).astype(np.uint8)
|
| 651 |
+
_, binary_mask = cv2.threshold(_mask, 127, 255, cv2.THRESH_BINARY)
|
| 652 |
+
contours, _ = cv2.findContours(
|
| 653 |
+
binary_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
|
| 654 |
+
)
|
| 655 |
+
|
| 656 |
+
if len(contours) == 0:
|
| 657 |
+
return 0.0, 0.0, 0.0
|
| 658 |
+
|
| 659 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
| 660 |
+
|
| 661 |
+
if method == "minEnclosingCircle":
|
| 662 |
+
(x, y), radius = cv2.minEnclosingCircle(largest_contour)
|
| 663 |
+
|
| 664 |
+
elif method == "area_based":
|
| 665 |
+
M = cv2.moments(largest_contour)
|
| 666 |
+
if M["m00"] == 0: # if the contour is too small
|
| 667 |
+
return 0.0, 0.0, 0.0
|
| 668 |
+
|
| 669 |
+
x = M["m10"] / M["m00"]
|
| 670 |
+
y = M["m01"] / M["m00"]
|
| 671 |
+
area = cv2.contourArea(largest_contour)
|
| 672 |
+
radius = np.sqrt(area / np.pi)
|
| 673 |
+
|
| 674 |
+
# # draw
|
| 675 |
+
# cv2.circle(_mask, (int(x), int(y)), int(radius), 128, 2)
|
| 676 |
+
# cv2.imwrite(f"mask_{i}.png", _mask)
|
| 677 |
+
|
| 678 |
+
return x, y, radius
|
| 679 |
+
|
| 680 |
+
|
| 681 |
+
class Lightsource_3Maps_Loader(Lightsource_Regress_Loader):
|
| 682 |
+
def __init__(self, config, num_lights=4):
|
| 683 |
+
Lightsource_Regress_Loader.__init__(self, config, num_lights=num_lights)
|
| 684 |
+
|
| 685 |
+
def build_gt_maps(self, coords, radii, H, W, kappa=0.4):
|
| 686 |
+
yy, xx = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
|
| 687 |
+
prob_gt = torch.zeros((H, W))
|
| 688 |
+
rad_gt = torch.zeros((H, W))
|
| 689 |
+
|
| 690 |
+
eps = 1e-6
|
| 691 |
+
for x_i, y_i, r_i in zip(coords[:, 0], coords[:, 1], radii):
|
| 692 |
+
if r_i < 1.0:
|
| 693 |
+
continue
|
| 694 |
+
|
| 695 |
+
sigma = kappa * r_i
|
| 696 |
+
g = torch.exp(-((xx - x_i) ** 2 + (yy - y_i) ** 2) / (2 * sigma**2))
|
| 697 |
+
g_prime = torch.exp(
|
| 698 |
+
-((xx - x_i) ** 2 + (yy - y_i) ** 2) / (2 * (sigma / 1.414) ** 2)
|
| 699 |
+
)
|
| 700 |
+
prob_gt = torch.maximum(prob_gt, g)
|
| 701 |
+
rad_gt = torch.maximum(rad_gt, g_prime * r_i)
|
| 702 |
+
|
| 703 |
+
rad_gt = rad_gt / (prob_gt + eps)
|
| 704 |
+
return prob_gt, rad_gt
|
| 705 |
+
|
| 706 |
+
def __getitem__(self, index):
|
| 707 |
+
# load base image
|
| 708 |
+
img_path = self.data_list[index]
|
| 709 |
+
base_img = Image.open(img_path).convert("RGB")
|
| 710 |
+
|
| 711 |
+
gamma = np.random.uniform(1.8, 2.2)
|
| 712 |
+
to_tensor = transforms.ToTensor()
|
| 713 |
+
adjust_gamma = RandomGammaCorrection(gamma)
|
| 714 |
+
adjust_gamma_reverse = RandomGammaCorrection(1 / gamma)
|
| 715 |
+
color_jitter = transforms.ColorJitter(brightness=(0.8, 3), hue=0.0)
|
| 716 |
+
|
| 717 |
+
base_img = to_tensor(base_img)
|
| 718 |
+
base_img = adjust_gamma(base_img)
|
| 719 |
+
if self.transform_base is not None:
|
| 720 |
+
        base_img = self.transform_base(base_img)

        sigma_chi = 0.01 * np.random.chisquare(df=1)
        base_img = Normal(base_img, sigma_chi).sample()
        gain = np.random.uniform(0.5, 1.2)
        base_img = gain * base_img
        base_img = torch.clamp(base_img, min=0, max=1)

        # init flare and light imgs
        flare_imgs = []
        light_imgs = []
        position = [
            [[-224, 0], [-224, 0]],
            [[-224, 0], [0, 224]],
            [[0, 224], [-224, 0]],
            [[0, 224], [0, 224]],
        ]
        axis = random.sample(range(4), 4)
        axis[-1] = axis[0]
        flare_nums = int(
            random.random() * self.num_lights + 1
        )  # random number of flares from 1 to 4

        for fn in range(flare_nums):
            choice_dataset = random.choices(
                [i for i in range(len(self.flare_list))], self.data_ratio
            )[0]
            choice_index = random.randint(0, len(self.flare_list[choice_dataset]) - 1)

            flare_path = self.flare_list[choice_dataset][choice_index]
            flare_img = Image.open(flare_path).convert("RGB")
            flare_img = to_tensor(flare_img)
            flare_img = adjust_gamma(flare_img)
            flare_img = remove_background(flare_img)

            if self.light_flag:
                light_path = self.light_list[choice_dataset][choice_index]
                light_img = Image.open(light_path).convert("RGB")
                light_img = to_tensor(light_img)
                light_img = adjust_gamma(light_img)

            if self.transform_flare is not None:
                if self.light_flag:
                    flare_merge = torch.cat((flare_img, light_img), dim=0)

                    if flare_nums == 1:
                        dx = random.randint(-224, 224)
                        dy = random.randint(-224, 224)
                    else:
                        dx = random.randint(
                            position[axis[fn]][0][0], position[axis[fn]][0][1]
                        )
                        dy = random.randint(
                            position[axis[fn]][1][0], position[axis[fn]][1][1]
                        )
                    if -160 < dx < 160 and -160 < dy < 160:
                        if random.random() < 0.5:
                            dx = 160 if dx > 0 else -160
                        else:
                            dy = 160 if dy > 0 else -160

                    flare_merge = self.transform_flare(flare_merge)
                    flare_merge = TF.affine(
                        flare_merge, angle=0, translate=(dx, dy), scale=1.0, shear=0
                    )
                    flare_merge = TF.center_crop(
                        flare_merge, (self.img_size, self.img_size)
                    )
                else:
                    flare_img = self.transform_flare(flare_img)

            # change color
            if self.light_flag:
                flare_img, light_img = torch.split(flare_merge, 3, dim=0)
            else:
                flare_img = color_jitter(flare_img)

            flare_imgs.append(flare_img)
            if self.light_flag:
                light_img = torch.clamp(light_img, min=0, max=1)
                light_imgs.append(light_img)

        flare_img = torch.sum(torch.stack(flare_imgs), dim=0)
        flare_img = torch.clamp(flare_img, min=0, max=1)

        # flare blur
        blur_transform = transforms.GaussianBlur(21, sigma=(0.1, 3.0))
        flare_img = blur_transform(flare_img)
        flare_img = torch.clamp(flare_img, min=0, max=1)

        merge_img = torch.clamp(flare_img + base_img, min=0, max=1)

        if self.light_flag:
            light_img = torch.sum(torch.stack(light_imgs), dim=0)
            light_img = torch.clamp(light_img, min=0, max=1)
            base_img = torch.clamp(base_img + light_img, min=0, max=1)
            flare_img = torch.clamp(flare_img - light_img, min=0, max=1)

        flare_mask = None
        if self.mask_type is None:
            return {
                "gt": adjust_gamma_reverse(base_img),
                "flare": adjust_gamma_reverse(flare_img),
                "lq": adjust_gamma_reverse(merge_img),
                "gamma": gamma,
            }

        elif self.mask_type == "light":
            one = torch.ones_like(base_img)
            zero = torch.zeros_like(base_img)
            threshold_value = 0.01

            # flare_masks_list = []
            XYRs = torch.zeros((self.num_lights, 4))
            for i in range(flare_nums):
                luminance = (
                    0.3 * light_imgs[i][0]
                    + 0.59 * light_imgs[i][1]
                    + 0.11 * light_imgs[i][2]
                )
                flare_mask = torch.where(luminance > threshold_value, one, zero)

                light_source_cond = (flare_mask.sum(dim=0) > 0).float()

                x, y, r = self.find_circle_properties(light_source_cond, i)
                XYRs[i] = torch.tensor([x, y, r, 1.0])

            gt_prob, gt_rad = self.build_gt_maps(
                XYRs[:, :2], XYRs[:, 2], self.img_size, self.img_size
            )
            gt_prob = gt_prob.unsqueeze(0)  # shape: (1, H, W)
            gt_rad = gt_rad.unsqueeze(0)
            gt_rad /= self.img_size
            gt_maps = torch.cat((gt_prob, gt_rad), dim=0)  # shape: (2, H, W)

            XYRs[:, :3] = XYRs[:, :3] / self.img_size

            luminance = 0.3 * light_img[0] + 0.59 * light_img[1] + 0.11 * light_img[2]
            flare_mask = torch.where(luminance > threshold_value, one, zero)

            light_source_cond = (flare_mask.sum(dim=0) > 0).float()

            light_source_cond = torch.repeat_interleave(
                light_source_cond[None, ...], 1, dim=0
            )

            # box = self.crop(light_source_cond[0])
            box = self.lightsource_crop(light_source_cond[0])

            # random int between 0 ~ 15
            margin = random.randint(0, 15)
            if box[0] - margin >= 0:
                box[0] -= margin
            if box[1] + margin < self.img_size:
                box[1] += margin
            if box[2] - margin >= 0:
                box[2] -= margin
            if box[3] + margin < self.img_size:
                box[3] += margin

            top, bottom, left, right = box[2], box[3], box[0], box[1]

            merge_img = adjust_gamma_reverse(merge_img)

            cropped_mask = torch.full(
                (self.img_size, self.img_size), True, dtype=torch.bool
            )
            cropped_mask[top : bottom + 1, left : right + 1] = False
            channel3_mask = cropped_mask.unsqueeze(0).expand(3, -1, -1)

            masked_img = merge_img * (1 - channel3_mask.float())
            masked_img[channel3_mask] = 0.5

            return {
                # add
                "input": self.normalize(masked_img),  # normalize to [-1, 1]
                "light_masks": light_source_cond,
                "xyrs": gt_maps,
            }


class TestImageLoader(Dataset):
    def __init__(
        self,
        dataroot_gt,
        dataroot_input,
        dataroot_mask,
        margin=0,
        img_size=512,
        noise_matching=False,
    ):
        super(TestImageLoader, self).__init__()
        self.gt_folder = dataroot_gt
        self.input_folder = dataroot_input
        self.mask_folder = dataroot_mask
        self.paths = glod_from_folder(
            [self.input_folder, self.gt_folder, self.mask_folder],
            ["input", "gt", "mask"],
        )

        self.margin = margin
        self.img_size = img_size
        self.noise_matching = noise_matching

    def __len__(self):
        return len(self.paths["input"])

    def __getitem__(self, index):
        img_name = self.paths["input"][index].split("/")[-1]
        num = img_name.split("_")[1].split(".")[0]

        # preprocess light source mask
        light_mask = np.array(Image.open(self.paths["mask"][index]))
        tmp_light_mask = np.zeros_like(light_mask[:, :, 0])
        tmp_light_mask[light_mask[:, :, 2] > 0] = 255
        cond = (light_mask[:, :, 0] > 0) & (light_mask[:, :, 1] > 0)
        tmp_light_mask[cond] = 0
        light_mask = tmp_light_mask

        # img for controlnet input
        control_img = np.repeat(light_mask[:, :, None], 3, axis=2)

        # crop region
        box = self.lightsource_crop(light_mask)

        if box[0] - self.margin >= 0:
            box[0] -= self.margin
        if box[1] + self.margin < self.img_size:
            box[1] += self.margin
        if box[2] - self.margin >= 0:
            box[2] -= self.margin
        if box[3] + self.margin < self.img_size:
            box[3] += self.margin

        # input image to be outpainted
        input_img = np.array(Image.open(self.paths["input"][index]))
        cropped_region = np.ones((self.img_size, self.img_size), dtype=np.uint8)
        cropped_region[box[2] : box[3] + 1, box[0] : box[1] + 1] = 0
        input_img[cropped_region == 1] = 128

        # image for blip
        blip_img = input_img[box[2] : box[3] + 1, box[0] : box[1] + 1, :]

        # noise matching
        input_img_matching = None
        if self.noise_matching:
            np_src_img = input_img / 255.0
            np_mask_rgb = np.repeat(cropped_region[:, :, None], 3, axis=2).astype(
                np.float32
            )
            matched_noise = self.get_matched_noise(np_src_img, np_mask_rgb)
            input_img_matching = (matched_noise * 255).astype(np.uint8)

        # mask image
        mask_img = (cropped_region * 255).astype(np.uint8)

        return {
            "blip_img": blip_img,
            "input_img": Image.fromarray(input_img),
            "input_img_matching": (
                Image.fromarray(input_img_matching)
                if input_img_matching is not None
                else Image.fromarray(input_img)
            ),
            "mask_img": Image.fromarray(mask_img),
            "control_img": Image.fromarray(control_img),
            "box": box,
            "output_name": "output_" + num + ".png",
        }

    def lightsource_crop(self, matrix):
        """Find the largest axis-aligned rectangle of 0s (the light-free region)
        in a binary mask and return it as [left, right, top, bottom]."""

        def largestRectangleArea(heights):
            heights.append(0)
            stack = [-1]
            max_area = 0
            max_rectangle = (0, 0, 0, 0)  # (area, left, right, height)
            for i in range(len(heights)):
                while heights[i] < heights[stack[-1]]:
                    h = heights[stack.pop()]
                    w = i - stack[-1] - 1
                    area = h * w
                    if area > max_area:
                        max_area = area
                        max_rectangle = (area, stack[-1] + 1, i - 1, h)
                stack.append(i)
            heights.pop()
            return max_rectangle

        max_area = 0
        max_rectangle = [0, 0, 0, 0]  # (left, right, top, bottom)
        heights = [0] * len(matrix[0])
        for row in range(len(matrix)):
            for i, val in enumerate(matrix[row]):
                heights[i] = heights[i] + 1 if val == 0 else 0

            area, left, right, height = largestRectangleArea(heights)
            if area > max_area:
                max_area = area
                max_rectangle = [int(left), int(right), int(row - height + 1), int(row)]

        return list(max_rectangle)

    # this function is taken from https://github.com/parlance-zz/g-diffuser-bot
    def get_matched_noise(
        self, _np_src_image, np_mask_rgb, noise_q=1, color_variation=0.05
    ):
        # helper fft routines that keep ortho normalization and auto-shift before and after fft
        def _fft2(data):
            if data.ndim > 2:  # has channels
                out_fft = np.zeros(
                    (data.shape[0], data.shape[1], data.shape[2]), dtype=np.complex128
                )
                for c in range(data.shape[2]):
                    c_data = data[:, :, c]
                    out_fft[:, :, c] = np.fft.fft2(
                        np.fft.fftshift(c_data), norm="ortho"
                    )
                    out_fft[:, :, c] = np.fft.ifftshift(out_fft[:, :, c])
            else:  # one channel
                out_fft = np.zeros((data.shape[0], data.shape[1]), dtype=np.complex128)
                out_fft[:, :] = np.fft.fft2(np.fft.fftshift(data), norm="ortho")
                out_fft[:, :] = np.fft.ifftshift(out_fft[:, :])

            return out_fft

        def _ifft2(data):
            if data.ndim > 2:  # has channels
                out_ifft = np.zeros(
                    (data.shape[0], data.shape[1], data.shape[2]), dtype=np.complex128
                )
                for c in range(data.shape[2]):
                    c_data = data[:, :, c]
                    out_ifft[:, :, c] = np.fft.ifft2(
                        np.fft.fftshift(c_data), norm="ortho"
                    )
                    out_ifft[:, :, c] = np.fft.ifftshift(out_ifft[:, :, c])
            else:  # one channel
                out_ifft = np.zeros((data.shape[0], data.shape[1]), dtype=np.complex128)
                out_ifft[:, :] = np.fft.ifft2(np.fft.fftshift(data), norm="ortho")
                out_ifft[:, :] = np.fft.ifftshift(out_ifft[:, :])

            return out_ifft

        def _get_gaussian_window(width, height, std=3.14, mode=0):
            window_scale_x = float(width / min(width, height))
            window_scale_y = float(height / min(width, height))

            window = np.zeros((width, height))
            x = (np.arange(width) / width * 2.0 - 1.0) * window_scale_x
            for y in range(height):
                fy = (y / height * 2.0 - 1.0) * window_scale_y
                if mode == 0:
                    window[:, y] = np.exp(-(x**2 + fy**2) * std)
                else:
                    window[:, y] = (1 / ((x**2 + 1.0) * (fy**2 + 1.0))) ** (
                        std / 3.14
                    )  # hey wait a minute that's not gaussian

            return window

        def _get_masked_window_rgb(np_mask_grey, hardness=1.0):
            np_mask_rgb = np.zeros((np_mask_grey.shape[0], np_mask_grey.shape[1], 3))
            if hardness != 1.0:
                hardened = np_mask_grey[:] ** hardness
            else:
                hardened = np_mask_grey[:]
            for c in range(3):
                np_mask_rgb[:, :, c] = hardened[:]
            return np_mask_rgb

        width = _np_src_image.shape[0]
        height = _np_src_image.shape[1]
        num_channels = _np_src_image.shape[2]

        _np_src_image[:] * (1.0 - np_mask_rgb)
        np_mask_grey = np.sum(np_mask_rgb, axis=2) / 3.0
        img_mask = np_mask_grey > 1e-6
        ref_mask = np_mask_grey < 1e-3

        windowed_image = _np_src_image * (1.0 - _get_masked_window_rgb(np_mask_grey))
        windowed_image /= np.max(windowed_image)
        windowed_image += (
            np.average(_np_src_image) * np_mask_rgb
        )  # / (1.-np.average(np_mask_rgb)) # rather than leave the masked area black, we get better results from fft by filling the average unmasked color

        src_fft = _fft2(windowed_image)  # get feature statistics from masked src img
        src_dist = np.absolute(src_fft)
        src_phase = src_fft / src_dist

        # create a generator with a static seed to make outpainting deterministic / only follow global seed
        rng = np.random.default_rng(0)

        noise_window = _get_gaussian_window(
            width, height, mode=1
        )  # start with simple gaussian noise
        noise_rgb = rng.random((width, height, num_channels))
        noise_grey = np.sum(noise_rgb, axis=2) / 3.0
        noise_rgb *= color_variation  # the colorfulness of the starting noise is blended to greyscale with a parameter
        for c in range(num_channels):
            noise_rgb[:, :, c] += (1.0 - color_variation) * noise_grey

        noise_fft = _fft2(noise_rgb)
        for c in range(num_channels):
            noise_fft[:, :, c] *= noise_window
        noise_rgb = np.real(_ifft2(noise_fft))
        shaped_noise_fft = _fft2(noise_rgb)
        shaped_noise_fft[:, :, :] = (
            np.absolute(shaped_noise_fft[:, :, :]) ** 2
            * (src_dist**noise_q)
            * src_phase
        )  # perform the actual shaping

        brightness_variation = 0.0  # color_variation # todo: temporarily tying brightness variation to color variation for now
        contrast_adjusted_np_src = (
            _np_src_image[:] * (brightness_variation + 1.0) - brightness_variation * 2.0
        )

        # scikit-image is used for histogram matching, very convenient!
        shaped_noise = np.real(_ifft2(shaped_noise_fft))
        shaped_noise -= np.min(shaped_noise)
        shaped_noise /= np.max(shaped_noise)
        shaped_noise[img_mask, :] = skimage.exposure.match_histograms(
            shaped_noise[img_mask, :] ** 1.0,
            contrast_adjusted_np_src[ref_mask, :],
            channel_axis=1,
        )
        shaped_noise = (
            _np_src_image[:] * (1.0 - np_mask_rgb) + shaped_noise * np_mask_rgb
        )

        matched_noise = shaped_noise[:]

        return np.clip(matched_noise, 0.0, 1.0)


class CustomImageLoader(Dataset):
    def __init__(
        self, dataroot_input, left_outpaint, right_outpaint, up_outpaint, down_outpaint
    ):
        self.dataroot_input = dataroot_input
        self.left_outpaint = left_outpaint
        self.right_outpaint = right_outpaint
        self.up_outpaint = up_outpaint
        self.down_outpaint = down_outpaint

        self.H = 512 - (up_outpaint + down_outpaint)
        self.W = 512 - (left_outpaint + right_outpaint)
        self.img_size = 512

        self.img_lists = [
            os.path.join(dataroot_input, f)
            for f in os.listdir(dataroot_input)
            if f.endswith(".png") or f.endswith(".jpg")
        ]

    def __len__(self):
        return len(self.img_lists)

    def __getitem__(self, index):
        img_name = self.img_lists[index].split("/")[-1]

        # crop region
        box = [
            self.left_outpaint,
            511 - self.right_outpaint,
            self.up_outpaint,
            511 - self.down_outpaint,
        ]  # [left, right, top, bottom]

        # box = self.lightsource_crop(light_mask)
        # if box[0] - self.margin >= 0:
        #     box[0] -= self.margin
        # if box[1] + self.margin < self.img_size:
        #     box[1] += self.margin
        # if box[2] - self.margin >= 0:
        #     box[2] -= self.margin
        # if box[3] + self.margin < self.img_size:
        #     box[3] += self.margin

        # input image to be outpainted
        input_img = np.zeros((self.img_size, self.img_size, 3), dtype=np.uint8)
        paste_img = np.array(
            Image.open(self.img_lists[index]).resize((self.W, self.H), Image.LANCZOS)
        )
        input_img[box[2] : box[3] + 1, box[0] : box[1] + 1, :] = paste_img
        cropped_region = np.ones((self.img_size, self.img_size), dtype=np.uint8)
        cropped_region[box[2] : box[3] + 1, box[0] : box[1] + 1] = 0
        input_img[cropped_region == 1] = 128

        # image for blip
        blip_img = np.array(Image.open(self.img_lists[index]))

        # # noise matching
        # input_img_matching = None
        # if self.noise_matching:
        #     np_src_img = input_img / 255.0
        #     np_mask_rgb = np.repeat(cropped_region[:, :, None], 3, axis=2).astype(
        #         np.float32
        #     )
        #     matched_noise = self.get_matched_noise(np_src_img, np_mask_rgb)
        #     input_img_matching = (matched_noise * 255).astype(np.uint8)

        # mask image
        mask_img = (cropped_region * 255).astype(np.uint8)

        return {
            "blip_img": blip_img,
            "input_img": Image.fromarray(input_img),
            # "input_img": (
            #     Image.fromarray(input_img_matching)
            #     if input_img_matching is not None
            #     else Image.fromarray(input_img)
            # ),
            "mask_img": Image.fromarray(mask_img),
            "box": box,
            "output_name": img_name,
        }


class HFCustomImageLoader(Dataset):
    def __init__(
        self, img_data, left_outpaint=64, right_outpaint=64, up_outpaint=64, down_outpaint=64
    ):
        self.left_outpaint = left_outpaint
        self.right_outpaint = right_outpaint
        self.up_outpaint = up_outpaint
        self.down_outpaint = down_outpaint

        self.H = 512 - (up_outpaint + down_outpaint)
        self.W = 512 - (left_outpaint + right_outpaint)
        self.img_size = 512

        self.img_lists = [img_data]

    def __len__(self):
        return len(self.img_lists)

    def __getitem__(self, index):
        # img_name = self.img_lists[index].split("/")[-1]

        # crop region
        box = [
            self.left_outpaint,
            511 - self.right_outpaint,
            self.up_outpaint,
            511 - self.down_outpaint,
        ]  # [left, right, top, bottom]

        # input image to be outpainted
        input_img = np.zeros((self.img_size, self.img_size, 3), dtype=np.uint8)
        paste_img = np.array(self.img_lists[index].resize((self.W, self.H), Image.LANCZOS))
        input_img[box[2] : box[3] + 1, box[0] : box[1] + 1, :] = paste_img
        cropped_region = np.ones((self.img_size, self.img_size), dtype=np.uint8)
        cropped_region[box[2] : box[3] + 1, box[0] : box[1] + 1] = 0
        input_img[cropped_region == 1] = 128

        # image for blip
        blip_img = np.array(self.img_lists[index])

        # # noise matching
        # input_img_matching = None
        # if self.noise_matching:
        #     np_src_img = input_img / 255.0
        #     np_mask_rgb = np.repeat(cropped_region[:, :, None], 3, axis=2).astype(
        #         np.float32
        #     )
        #     matched_noise = self.get_matched_noise(np_src_img, np_mask_rgb)
        #     input_img_matching = (matched_noise * 255).astype(np.uint8)

        # mask image
        mask_img = (cropped_region * 255).astype(np.uint8)

        return {
            "blip_img": blip_img,
            "input_img": Image.fromarray(input_img),
            "mask_img": Image.fromarray(mask_img),
            "box": box,
        }


if __name__ == "__main__":
    pass
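Note: `lightsource_crop` above drives the outpainting crop by scanning the mask row by row, keeping a histogram of consecutive zero pixels per column, and taking the largest rectangle found by the stack-based histogram routine. A minimal sketch of its behavior on a toy mask (not part of this commit; the toy values are made up for illustration):

import numpy as np

mask = np.zeros((8, 8), dtype=np.uint8)
mask[0:3, 5:8] = 1  # pretend a light source sits in the top-right corner

loader = TestImageLoader.__new__(TestImageLoader)  # the helper uses no instance state
left, right, top, bottom = loader.lightsource_crop(mask)
print(left, right, top, bottom)  # -> 0 4 0 7: the largest light-free rectangle
# everything outside this box is later filled with 128 and regenerated by the
# outpainting pipeline, so the light source ends up in the generated region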
utils/loss.py
ADDED
@@ -0,0 +1,80 @@
import torch
import torch.nn.functional as F
from torch import nn
from scipy.optimize import linear_sum_assignment


class uncertainty_light_pos_loss(nn.Module):
    def __init__(self):
        super(uncertainty_light_pos_loss, self).__init__()
        self.log_var_xyr = nn.Parameter(torch.tensor(1.0, requires_grad=True))
        self.log_var_p = nn.Parameter(torch.tensor(1.0, requires_grad=True))

    def forward(self, logits, targets):
        B, N, P = logits.shape  # (B, 4, 4)

        position_loss = 0
        confidence_loss = 0

        w_xyr = 0.5 / (self.log_var_xyr**2)  # uncertainty weight for position loss
        w_p = 0.5 / (self.log_var_p**2)  # uncertainty weight for confidence loss
        weights = torch.tensor([1, 1, 2], device=logits.device)  # weights for x, y, r

        for b in range(B):
            pred_xyr = logits[b, :, :3]  # (N, 3)
            pred_p = logits[b, :, 3]  # (N,)

            gt_xyr = targets[b, :, :3]  # (N, 3)
            gt_p = targets[b, :, 3]  # (N,)

            cost_matrix = torch.cdist(gt_xyr, pred_xyr, p=2)  # (N, N)

            with torch.no_grad():
                row_ind, col_ind = linear_sum_assignment(cost_matrix.cpu().numpy())

            matched_pred_xyr = pred_xyr[col_ind]
            matched_gt_xyr = gt_xyr[row_ind]
            matched_pred_p = pred_p[col_ind]
            matched_gt_p = gt_p[row_ind]

            valid_mask = matched_gt_p > 0
            valid_cnt = valid_mask.sum().clamp(min=1)

            xyr_loss = (
                F.smooth_l1_loss(
                    matched_pred_xyr[valid_mask],
                    matched_gt_xyr[valid_mask],
                    reduction="none",
                )
                * weights
            ).sum()

            p_loss = F.binary_cross_entropy(
                matched_pred_p, matched_gt_p, reduction="mean"
            )

            position_loss += xyr_loss / valid_cnt
            confidence_loss += p_loss

        position_loss = w_xyr * (position_loss / B) + torch.log(1 + self.log_var_xyr**2)
        confidence_loss = w_p * (confidence_loss / B) + torch.log(1 + self.log_var_p**2)

        return position_loss, confidence_loss


class unet_3maps_loss(nn.Module):
    def __init__(self):
        super(unet_3maps_loss, self).__init__()

    def forward(self, pred_prob, pred_rad, prob_gt, rad_gt):
        focal = nn.BCELoss()
        L_prob = focal(pred_prob, prob_gt)

        pos_mask = prob_gt > 0.5
        L_rad = (
            nn.functional.smooth_l1_loss(pred_rad[pos_mask], rad_gt[pos_mask])
            if pos_mask.any()
            else pred_rad.sum() * 0
        )

        return L_prob + 10.0 * L_rad, L_prob, L_rad
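Note: a quick sketch of how `uncertainty_light_pos_loss` is meant to be driven (not from the repo; the shapes follow the `(B, 4, 4)` comment in `forward`, and the dummy values are illustrative only). Both predictions and targets carry `(x, y, r, p)` per candidate light, with `p` already in `[0, 1]` so the BCE term is valid:

import torch

criterion = uncertainty_light_pos_loss()
pred = torch.rand(2, 4, 4)                      # (B, N, 4): candidate (x, y, r, p), p in [0, 1]
gt = torch.zeros(2, 4, 4)
gt[:, 0] = torch.tensor([0.5, 0.5, 0.1, 1.0])   # one annotated light per sample, rest padded with p = 0
pos_loss, conf_loss = criterion(pred, gt)       # Hungarian matching pairs predictions with GT lights
total = pos_loss + conf_loss                    # both terms already carry the learned log-variance weights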
utils/utils.py
ADDED
@@ -0,0 +1,311 @@
import os
import cv2
import numpy as np
import skimage
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF
from PIL import Image
from skimage.draw import disk
from skimage import morphology
from collections import OrderedDict


def load_mfdnet_checkpoint(model, weights):
    checkpoint = torch.load(weights, map_location=lambda storage, loc: storage.cuda(0))
    new_state_dict = OrderedDict()
    for key, value in checkpoint["state_dict"].items():
        if key.startswith("module"):
            name = key[7:]
        else:
            name = key
        new_state_dict[name] = value
    model.load_state_dict(new_state_dict)


def adjust_gamma(image: torch.Tensor, gamma):
    # image is in shape of [B,C,H,W] and gamma is in shape [B]
    gamma = gamma.float().cuda()
    gamma_tensor = torch.ones_like(image)
    gamma_tensor = gamma.view(-1, 1, 1, 1) * gamma_tensor
    image = torch.pow(image, gamma_tensor)
    out = torch.clamp(image, 0.0, 1.0)
    return out


def adjust_gamma_reverse(image: torch.Tensor, gamma):
    # gamma=torch.Tensor([gamma]).cuda()
    gamma = 1 / gamma.float().cuda()
    gamma_tensor = torch.ones_like(image)
    gamma_tensor = gamma.view(-1, 1, 1, 1) * gamma_tensor
    image = torch.pow(image, gamma_tensor)
    out = torch.clamp(image, 0.0, 1.0)
    return out


def predict_flare_from_6_channel(input_tensor, gamma):
    # the input is a tensor in [B,C,H,W], the C here is 6

    deflare_img = input_tensor[:, :3, :, :]
    flare_img_predicted = input_tensor[:, 3:, :, :]

    merge_img_predicted_linear = adjust_gamma(deflare_img, gamma) + adjust_gamma(
        flare_img_predicted, gamma
    )
    merge_img_predicted = adjust_gamma_reverse(
        torch.clamp(merge_img_predicted_linear, 1e-7, 1.0), gamma
    )
    return deflare_img, flare_img_predicted, merge_img_predicted


def predict_flare_from_3_channel(
    input_tensor, flare_mask, base_img, flare_img, merge_img, gamma
):
    # the input is a tensor in [B,C,H,W], the C here is 3

    input_tensor_linear = adjust_gamma(input_tensor, gamma)
    merge_tensor_linear = adjust_gamma(merge_img, gamma)
    flare_img_predicted = adjust_gamma_reverse(
        torch.clamp(merge_tensor_linear - input_tensor_linear, 1e-7, 1.0), gamma
    )

    masked_deflare_img = input_tensor * (1 - flare_mask) + base_img * flare_mask
    masked_flare_img_predicted = (
        flare_img_predicted * (1 - flare_mask) + flare_img * flare_mask
    )

    return masked_deflare_img, masked_flare_img_predicted


def get_highlight_mask(image, threshold=0.99, luminance_mode=False):
    """Get the area close to the exposure
    Args:
        image: the image tensor in [B,C,H,W]. For inference, B is set as 1.
        threshold: the threshold of luminance/greyscale of exposure region
        luminance_mode: use luminance or greyscale
    Return:
        Binary image in [B,H,W]
    """
    if luminance_mode:
        # 3 channels in RGB
        luminance = (
            0.2126 * image[:, 0, :, :]
            + 0.7152 * image[:, 1, :, :]
            + 0.0722 * image[:, 2, :, :]
        )
        binary_mask = luminance > threshold
    else:
        binary_mask = image.mean(dim=1, keepdim=True) > threshold
    binary_mask = binary_mask.to(image.dtype)
    return binary_mask


def refine_mask(mask, morph_size=0.01):
    """Refines a mask by applying morphological operations.
    Args:
        mask: A float array of shape [H, W]
        morph_size: Size of the morphological kernel relative to the long side of
            the image.

    Returns:
        Refined mask of shape [H, W].
    """
    mask_size = max(np.shape(mask))
    kernel_radius = 0.5 * morph_size * mask_size
    kernel = morphology.disk(np.ceil(kernel_radius))
    opened = morphology.binary_opening(mask, kernel)
    return opened


def _create_disk_kernel(kernel_size):
    _EPS = 1e-7
    x = np.arange(kernel_size) - (kernel_size - 1) / 2
    xx, yy = np.meshgrid(x, x)
    rr = np.sqrt(xx**2 + yy**2)
    kernel = np.float32(rr <= np.max(x)) + _EPS
    kernel = kernel / np.sum(kernel)
    return kernel


def blend_light_source(input_scene, pred_scene, threshold=0.99, luminance_mode=False):
    binary_mask = (
        get_highlight_mask(
            input_scene, threshold=threshold, luminance_mode=luminance_mode
        )
        > 0.5
    ).to("cpu", torch.bool)
    binary_mask = binary_mask.squeeze()  # (h, w)
    binary_mask = binary_mask.numpy()
    binary_mask = refine_mask(binary_mask)

    labeled = skimage.measure.label(binary_mask)
    properties = skimage.measure.regionprops(labeled)
    max_diameter = 0
    for p in properties:
        # The diameter of a circle with the same area as the region.
        max_diameter = max(max_diameter, p["equivalent_diameter"])

    mask = np.float32(binary_mask)
    kernel_size = round(1.5 * max_diameter)  # default is 1.5
    if kernel_size > 0:
        kernel = _create_disk_kernel(kernel_size)
        mask = cv2.filter2D(mask, -1, kernel)
        mask = np.clip(mask * 3.0, 0.0, 1.0)
        mask_rgb = np.stack([mask] * 3, axis=0)

        mask_rgb = torch.from_numpy(mask_rgb).to(input_scene.device, torch.float32)
        blend = input_scene * mask_rgb + pred_scene * (1 - mask_rgb)
    else:
        blend = pred_scene
    return blend


def blend_with_alpha(result, input_img, box, blur_size=31):
    """
    Apply alpha blending to paste the specified box region from input_img onto the result image
    to reduce boundary artifacts and make the blending more natural.

    Args:
        result (np.array): inpainting generated image
        input_img (np.array): original image
        box (tuple): (x_min, x_max, y_min, y_max) representing the paste-back region from the original image
        blur_size (int): blur range for the mask, larger values create smoother transitions (recommended 15~50)

    Returns:
        np.array: image after alpha blending
    """

    x_min, x_max, y_min, y_max = box

    # alpha mask
    mask = np.zeros_like(result, dtype=np.float32)
    mask[y_min : y_max + 1, x_min : x_max + 1] = 1.0

    # gaussian blur
    mask = cv2.GaussianBlur(mask, (blur_size, blur_size), 0)

    # alpha blending
    blended = (mask * input_img + (1 - mask) * result).astype(np.uint8)

    return blended


def IoU(pred, target):
    assert pred.shape == target.shape, "Prediction and target must have the same shape."

    intersection = np.logical_and(pred, target).sum()
    union = np.logical_or(pred, target).sum()

    if union == 0:
        return 1.0 if intersection == 0 else 0.0

    return intersection / union


def mean_IoU(y_true, y_pred, num_classes):
    """
    Calculate the mean Intersection over Union (mIoU) score.

    Args:
        y_true (np.ndarray): Ground truth labels (integer class values).
        y_pred (np.ndarray): Predicted labels (integer class values).
        num_classes (int): Number of classes.

    Returns:
        float: The mean IoU score across all classes.
    """
    iou_scores = []

    for cls in range(num_classes):
        # Create binary masks for the current class
        true_mask = y_true == cls
        pred_mask = y_pred == cls

        # Calculate intersection and union
        intersection = np.logical_and(true_mask, pred_mask)
        union = np.logical_or(true_mask, pred_mask)

        # Compute IoU for the current class
        if np.sum(union) == 0:
            # Handle edge case: no samples for this class
            iou_scores.append(np.nan)
        else:
            iou_scores.append(np.sum(intersection) / np.sum(union))

    # Calculate mean IoU, ignoring NaN values (classes without samples)
    mean_iou = np.nanmean(iou_scores)
    return mean_iou


def RGB2YCbCr(img):
    img = img * 255.0
    r, g, b = torch.split(img, 1, dim=0)
    y = torch.zeros_like(r)
    cb = torch.zeros_like(r)
    cr = torch.zeros_like(r)

    y = 0.257 * r + 0.504 * g + 0.098 * b + 16
    y = y / 255.0

    cb = -0.148 * r - 0.291 * g + 0.439 * b + 128
    cb = cb / 255.0

    cr = 0.439 * r - 0.368 * g - 0.071 * b + 128
    cr = cr / 255.0

    img = torch.cat([y, y, y], dim=0)
    return img


def extract_peaks(prob_map, thr=0.5, pool=7):
    """
    prob_map: (H, W) after sigmoid
    return: tensor of peak coordinates [K, 2] (x, y)
    """
    # binary mask
    pos = prob_map > thr

    # non-maximum suppression
    nms = F.max_pool2d(
        prob_map.unsqueeze(0).unsqueeze(0),
        kernel_size=pool,
        stride=1,
        padding=pool // 2,
    )
    peaks = (prob_map == nms.squeeze()) & pos
    ys, xs = torch.nonzero(peaks, as_tuple=True)
    return torch.stack([xs, ys], dim=1)  # (K, 2)


def pick_radius(radius_map, centers, ksize=3):
    """
    radius_map: (H, W) ∈ [0, 1]
    centers: (K, 2) x,y
    return: (K,) radii in pixel
    """
    # H, W = radius_map.shape
    pad = ksize // 2
    padded = F.pad(
        radius_map.unsqueeze(0).unsqueeze(0), (pad, pad, pad, pad), mode="reflect"
    )

    radii = []
    for x, y in centers:
        patch = padded[..., y : y + ksize, x : x + ksize]
        radii.append(patch.mean())  # 3×3 mean
    return torch.stack(radii)


def draw_mask(centers, radii, H, W):
    """
    centers: (K, 2) (x, y)
    radii: (K,)
    return: (H, W) uint8 mask
    """
    radii *= 256
    mask = np.zeros((H, W), dtype=np.float32)
    for (x, y), r in zip(centers, radii):
        rr, cc = disk((y.item(), x.item()), r.item(), shape=mask.shape)
        mask[rr, cc] = 1
    return mask
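Note: the last three helpers are designed to be chained at inference time: `extract_peaks` finds local maxima in the predicted probability map, `pick_radius` reads the normalized radius around each peak, and `draw_mask` rasterizes the detected lights into a binary mask. A rough sketch (not from the commit) under the assumption of a 256x256 map, which matches the factor `draw_mask` uses to rescale the normalized radii:

import torch

H = W = 256
prob_map = torch.rand(H, W)      # stand-in for the sigmoid output of the light-source head
radius_map = torch.rand(H, W)    # stand-in for the normalized radius map in [0, 1]

centers = extract_peaks(prob_map, thr=0.9, pool=7)  # (K, 2) peak coordinates as (x, y)
if len(centers) > 0:
    radii = pick_radius(radius_map, centers)        # (K,) mean radius in a 3x3 window around each peak
    light_mask = draw_mask(centers, radii, H, W)    # (H, W) float mask with one filled disk per light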
weights/light_outpaint_lora/pytorch_lora_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:04aeb7148ae4d8c59f0d0260ee813c2fe41a8392d826c4941dfda9ed7cf7090d
size 3358448
weights/light_regress/model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c4e2ac2d23180814361ec04bcb22cc92adb761fb5ccc761b5c3874a297fed18
size 85314151
weights/net_g_last.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:75f0fc77ab43703c7a9c7876621f8a651d6ce3a0cfb7c6e2377b3c8e2331b0e2
size 82605273