mv-lab committed
Commit
39417b0
1 Parent(s): 616408c

InstructIR x HF

.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+ *.pt
+ *.gif
+ *.pth
app.py ADDED
@@ -0,0 +1,157 @@
+ import argparse
+
+ import gradio as gr
+ from PIL import Image
+ import os
+ import torch
+ import numpy as np
+ import yaml
+
+ #from gradio_imageslider import ImageSlider
+
+ ## local code
+ from models import instructir
+ from text.models import LanguageModel, LMHead
+
+
+ def dict2namespace(config):
+     namespace = argparse.Namespace()
+     for key, value in config.items():
+         if isinstance(value, dict):
+             new_value = dict2namespace(value)
+         else:
+             new_value = value
+         setattr(namespace, key, new_value)
+     return namespace
+
+
+ CONFIG = "configs/eval5d.yml"
+ LM_MODEL = "models/lm_instructir-7d.pt"
+ MODEL_NAME = "models/im_instructir-7d.pt"
+
+ # parse config file
+ with open(os.path.join(CONFIG), "r") as f:
+     config = yaml.safe_load(f)
+
+ cfg = dict2namespace(config)
+
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ model = instructir.create_model(input_channels=cfg.model.in_ch, width=cfg.model.width, enc_blks=cfg.model.enc_blks,
+                                 middle_blk_num=cfg.model.middle_blk_num, dec_blks=cfg.model.dec_blks, txtdim=cfg.model.textdim)
+ model = model.to(device)
+ print("IMAGE MODEL CKPT:", MODEL_NAME)
+ model.load_state_dict(torch.load(MODEL_NAME, map_location="cpu"), strict=True)
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ LMODEL = cfg.llm.model
+ language_model = LanguageModel(model=LMODEL)
+ lm_head = LMHead(embedding_dim=cfg.llm.model_dim, hidden_dim=cfg.llm.embd_dim, num_classes=cfg.llm.nclasses)
+ lm_head = lm_head.to(device)
+
+ print("LMHEAD MODEL CKPT:", LM_MODEL)
+ lm_head.load_state_dict(torch.load(LM_MODEL, map_location="cpu"), strict=True)
+
+
+ def load_img(filename, norm=True):
+     img = np.array(Image.open(filename).convert("RGB"))
+     if norm:
+         img = img / 255.
+         img = img.astype(np.float32)
+     return img
+
+
+ def process_img(image, prompt):
+     img = np.array(image)
+     img = img / 255.
+     img = img.astype(np.float32)
+     y = torch.tensor(img).permute(2, 0, 1).unsqueeze(0).to(device)
+
+     lm_embd = language_model(prompt)
+     lm_embd = lm_embd.to(device)
+
+     with torch.no_grad():
+         text_embd, deg_pred = lm_head(lm_embd)
+         x_hat = model(y, text_embd)
+
+     restored_img = x_hat.squeeze().permute(1, 2, 0).clamp_(0, 1).cpu().detach().numpy()
+     restored_img = np.clip(restored_img, 0., 1.)
+
+     restored_img = (restored_img * 255.0).round().astype(np.uint8)  # float32 to uint8
+     return Image.fromarray(restored_img)  # (image, Image.fromarray(restored_img))
+
+
+
+ title = "InstructIR ✏️🖼️ 🤗"
+ description = ''' ## [High-Quality Image Restoration Following Human Instructions](https://github.com/mv-lab/InstructIR)
+
+ [Marcos V. Conde](https://scholar.google.com/citations?user=NtB1kjYAAAAJ&hl=en), [Gregor Geigle](https://scholar.google.com/citations?user=uIlyqRwAAAAJ&hl=en), [Radu Timofte](https://scholar.google.com/citations?user=u3MwH5kAAAAJ&hl=en)
+
+ Computer Vision Lab, University of Wuerzburg | Sony PlayStation, FTG
+
+ ### TL;DR: quickstart
+ InstructIR takes as input an image and a human-written instruction for how to improve that image. The neural model performs all-in-one image restoration. InstructIR achieves state-of-the-art results on several restoration tasks, including image denoising, deraining, deblurring, dehazing, and (low-light) image enhancement.
+
+ **🚀 You can start with the [demo tutorial](https://github.com/mv-lab/InstructIR/blob/main/demo.ipynb)**
+
+ <details>
+ <summary> <b> Abstract</b> (click me to read)</summary>
+ <p>
+ Image restoration is a fundamental problem that involves recovering a high-quality clean image from its degraded observation. All-In-One image restoration models can effectively restore images from various types and levels of degradation using degradation-specific information as prompts to guide the restoration model. In this work, we present the first approach that uses human-written instructions to guide the image restoration model. Given natural language prompts, our model can recover high-quality images from their degraded counterparts, considering multiple degradation types. Our method, InstructIR, achieves state-of-the-art results on several restoration tasks including image denoising, deraining, deblurring, dehazing, and (low-light) image enhancement. InstructIR improves +1dB over previous all-in-one restoration methods. Moreover, our dataset and results represent a novel benchmark for new research on text-guided image restoration and enhancement.
+ </p>
+ </details>
+
+ > Disclaimer: please remember this is not a product, so you will notice some limitations.
+
+ **This demo expects an image with some degradations (blur, noise, rain, low-light, haze) and a prompt describing what should be done.**
+ Due to GPU memory limitations, the app might crash if you feed a high-resolution image (2K, 4K).
+
+ <br>
+ '''
+ # **Demo notebook can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Swin2SR/Perform_image_super_resolution_with_Swin2SR.ipynb).
+
+ article = "<p style='text-align: center'><a href='https://github.com/mv-lab/InstructIR' target='_blank'>High-Quality Image Restoration Following Human Instructions</a></p>"
+
+ examples = [['images/rain-020.png', "I love this photo, could you remove the raindrops? please keep the content intact"],
+             ['images/gradio_demo_images/city.jpg', "I took this photo during a foggy day, can you improve it?"],
+             ['images/gradio_demo_images/frog.png', "can you remove the tiny dots in the image? it is very unpleasant"],
+             ["images/lol_748.png", "my image is too dark, I cannot see anything, can you fix it?"],
+             ["images/gopro.png", "I took this photo while I was running, can you stabilize the image? it is too blurry"],
+             ["images/a0010.jpg", "please I want this image for my photo album, can you edit it as a photographer"]]
+
+ css = """
+ .image-frame img, .image-container img {
+     width: auto;
+     height: auto;
+     max-width: none;
+ }
+ """
+
+ demo = gr.Interface(
+     fn=process_img,
+     inputs=[
+         gr.Image(type="pil", label="Input"),
+         gr.Text(label="Prompt")
+     ],
+     outputs=[gr.Image(type="pil", label="Output")],  # or ImageSlider(position=0.5, type="pil", label="SideBySide")
+     title=title,
+     description=description,
+     article=article,
+     examples=examples,
+     css=css,
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
+
+ # with gr.Blocks() as demo:
+ #     with gr.Row(equal_height=True):
+ #         with gr.Column(scale=1):
+ #             input = gr.Image(type="pil", label="Input")
+ #         with gr.Column(scale=1):
+ #             prompt = gr.Text(label="Prompt")
+ #             process_btn = gr.Button("Process")
+ #     with gr.Row(equal_height=True):
+ #         output = gr.Image(type="pil", label="Output")
+ #         slider = ImageSlider(position=0.5, type="pil", label="SideBySide")
+ #     process_btn.click(fn=process_img, inputs=[input, prompt], outputs=[output, slider])
+ # demo.launch(share=True)
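
For readers who want to reuse the restoration pipeline outside of the Gradio UI, here is a minimal sketch (not part of the commit) of calling `process_img` programmatically. It assumes the checkpoints referenced above are present and that `images/rain-020.png` exists, and it relies on importing `app.py` as a module, which runs the model-loading code at import time.

```python
# Minimal sketch (assumption): reuse the models loaded by app.py from a Python session.
from PIL import Image

import app  # importing app.py loads the InstructIR weights and the language model

degraded = Image.open("images/rain-020.png").convert("RGB")
restored = app.process_img(degraded, "please remove the rain from my photo")
restored.save("restored.png")
print("Saved restored image with size:", restored.size)
```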
configs/eval5d.yml ADDED
@@ -0,0 +1,40 @@
+ llm:
+   model: 'TaylorAI/bge-micro-v2'  # See Paper Sec. 3.2 and Appendix
+   model_dim: 384
+   embd_dim: 256
+   nclasses: 7  # noise, blur, rain, haze, lol, enhancement, upsampling (Paper Sec. 4.3)
+   weights: False
+
+ model:
+   arch: "instructir"
+   use_text: True
+   in_ch: 3
+   out_ch: 3
+   width: 32
+   enc_blks: [2, 2, 4, 8]
+   middle_blk_num: 4
+   dec_blks: [2, 2, 2, 2]
+   textdim: 256
+   weights: False
+
+ test:
+   batch_size: 1
+   num_workers: 3
+
+   dn_datapath: "data/denoising_testsets/"
+   dn_datasets: ["CBSD68", "urban100", "Kodak24", "McMaster"]
+   dn_sigmas: [15, 25, 50]
+
+   rain_targets: ["data/Rain/rain_test/Rain100L/target/"]
+   rain_inputs: ["data/Rain/rain_test/Rain100L/input/"]
+
+   haze_targets: "data/SOTS-OUT/GT/"
+   haze_inputs: "data/SOTS-OUT/IN/"
+
+   lol_targets: "data/LOL/eval15/high/"
+   lol_inputs: "data/LOL/eval15/low/"
+
+   gopro_targets: "data/gopro_test/GoPro/target/"
+   gopro_inputs: "data/gopro_test/GoPro/input/"
+
+
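
The `llm` and `model` sections of this config have to agree: the language head projects the 384-dim sentence embedding down to `embd_dim`, and that vector is consumed by the image model's instruction blocks, whose size is `textdim`. A small sketch (an assumption, not part of the commit) that loads the YAML and checks this wiring, mirroring what `app.py` does via `dict2namespace`:

```python
# Load eval5d.yml and verify that the text head and the image model use the same
# conditioning dimension (256 in this config).
import yaml

with open("configs/eval5d.yml", "r") as f:
    cfg = yaml.safe_load(f)

assert cfg["llm"]["embd_dim"] == cfg["model"]["textdim"], \
    "LMHead's hidden_dim feeds InstructIR's ICB layers, so the two dims must match"
print("text embedding dim:", cfg["model"]["textdim"])         # 256
print("degradation classes:", cfg["llm"]["nclasses"])         # 7
print("encoder blocks per stage:", cfg["model"]["enc_blks"])  # [2, 2, 4, 8]
```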
images/a0010.jpg ADDED
images/frog.png ADDED
images/gopro.png ADDED
images/gradio_demo_images/bear.png ADDED
images/gradio_demo_images/city.jpg ADDED
images/gradio_demo_images/frog.png ADDED
images/lol_1.png ADDED
images/lol_748.png ADDED
images/noise50.png ADDED
images/rain-020.png ADDED
models/instructir.py ADDED
@@ -0,0 +1,134 @@
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn import init as init
+ from torch.nn.modules.batchnorm import _BatchNorm
+
+ from models.nafnet_utils import Local_Base, LayerNorm2d
+ from models.nafnet import SimpleGate, NAFBlock
+
+
+ class ICB(nn.Module):
+     """
+     Instruction Condition Block (ICB)
+     Paper Section 3.3
+     """
+
+     def __init__(self, feature_dim, text_dim=768):
+         super(ICB, self).__init__()
+         self.fc = nn.Linear(text_dim, feature_dim)
+         self.block = NAFBlock(feature_dim)
+         self.beta = nn.Parameter(torch.zeros((1, feature_dim, 1, 1)), requires_grad=True)
+         self.gamma = nn.Parameter(torch.zeros((1, feature_dim, 1, 1)), requires_grad=True)
+
+     def forward(self, x, text_embedding):
+         gating_factors = torch.sigmoid(self.fc(text_embedding))
+         gating_factors = gating_factors.unsqueeze(-1).unsqueeze(-1)
+
+         f = x * self.gamma + self.beta  # 1) learned feature scaling/modulation
+         f = f * gating_factors          # 2) (soft) feature routing based on text
+         f = self.block(f)               # 3) block feature enhancement
+         return f + x
+
+
+ class InstructIR(nn.Module):
+     """
+     InstructIR model using NAFNet (ECCV 2022) as backbone.
+     The model takes as input an RGB image and a text embedding (encoded instruction).
+     Described in Paper Section 3.3
+     """
+
+     def __init__(self, img_channel=3, width=16, middle_blk_num=1, enc_blk_nums=[], dec_blk_nums=[], txtdim=768):
+         super().__init__()
+
+         self.intro = nn.Conv2d(in_channels=img_channel, out_channels=width, kernel_size=3, padding=1, stride=1, groups=1,
+                                bias=True)
+         self.ending = nn.Conv2d(in_channels=width, out_channels=img_channel, kernel_size=3, padding=1, stride=1, groups=1,
+                                 bias=True)
+
+         self.encoders = nn.ModuleList()
+         self.decoders = nn.ModuleList()
+         self.middle_blks = nn.ModuleList()
+         self.ups = nn.ModuleList()
+         self.downs = nn.ModuleList()
+         self.enc_cond = nn.ModuleList()
+         self.dec_cond = nn.ModuleList()
+
+         chan = width
+         for num in enc_blk_nums:
+             self.encoders.append(
+                 nn.Sequential(
+                     *[NAFBlock(chan) for _ in range(num)]
+                 )
+             )
+
+             self.enc_cond.append(ICB(chan, txtdim))
+
+             self.downs.append(
+                 nn.Conv2d(chan, 2*chan, 2, 2)
+             )
+             chan = chan * 2
+
+         self.middle_blks = nn.Sequential(
+             *[NAFBlock(chan) for _ in range(middle_blk_num)]
+         )
+
+         for num in dec_blk_nums:
+             self.ups.append(
+                 nn.Sequential(
+                     nn.Conv2d(chan, chan * 2, 1, bias=False),
+                     nn.PixelShuffle(2)
+                 )
+             )
+             chan = chan // 2
+             self.decoders.append(
+                 nn.Sequential(
+                     *[NAFBlock(chan) for _ in range(num)]
+                 )
+             )
+             # Add text embedding as modulation
+             self.dec_cond.append(ICB(chan, txtdim))
+
+         self.padder_size = 2 ** len(self.encoders)
+
+     def forward(self, inp, txtembd):
+         B, C, H, W = inp.shape
+         inp = self.check_image_size(inp)
+
+         x = self.intro(inp)
+         encs = []
+
+         for encoder, enc_mod, down in zip(self.encoders, self.enc_cond, self.downs):
+             x = encoder(x)
+             x = enc_mod(x, txtembd)
+             encs.append(x)
+             x = down(x)
+
+         x = self.middle_blks(x)
+
+         for decoder, up, enc_skip, dec_mod in zip(self.decoders, self.ups, encs[::-1], self.dec_cond):
+             x = up(x)
+             x = x + enc_skip
+             x = decoder(x)
+             x = dec_mod(x, txtembd)
+
+         x = self.ending(x)
+         x = x + inp
+
+         return x[:, :, :H, :W]
+
+     def check_image_size(self, x):
+         _, _, h, w = x.size()
+         mod_pad_h = (self.padder_size - h % self.padder_size) % self.padder_size
+         mod_pad_w = (self.padder_size - w % self.padder_size) % self.padder_size
+         x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h))
+         return x
+
+
+ def create_model(input_channels=3, width=32, enc_blks=[2, 2, 4, 8], middle_blk_num=12, dec_blks=[2, 2, 2, 2], txtdim=768):
+
+     net = InstructIR(img_channel=input_channels, width=width, middle_blk_num=middle_blk_num,
+                      enc_blk_nums=enc_blks, dec_blk_nums=dec_blks, txtdim=txtdim)
+
+     return net
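
`forward` pads the input to a multiple of `padder_size` (2^4 = 16 with the four encoder stages of `eval5d.yml`) and crops back to the original resolution at the end, so arbitrary image sizes are accepted. A quick shape check under random inputs, assuming the script is run from the repo root so the `models` package resolves:

```python
# Minimal shape check (assumption: run from the repo root).
import torch
from models.instructir import create_model

net = create_model(input_channels=3, width=32, enc_blks=[2, 2, 4, 8],
                   middle_blk_num=4, dec_blks=[2, 2, 2, 2], txtdim=256)
image = torch.rand(1, 3, 100, 150)   # deliberately not a multiple of 16
text_embedding = torch.rand(1, 256)  # stands in for the LMHead output
with torch.no_grad():
    out = net(image, text_embedding)
print(out.shape)  # torch.Size([1, 3, 100, 150])
```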
models/nafnet.py ADDED
@@ -0,0 +1,201 @@
+ # ------------------------------------------------------------------------
+ # Copyright (c) 2022 megvii-model. All Rights Reserved.
+ # ------------------------------------------------------------------------
+ # Source: https://github.com/megvii-research/NAFNet
+
+ '''
+ Simple Baselines for Image Restoration
+
+ @article{chen2022simple,
+     title={Simple Baselines for Image Restoration},
+     author={Chen, Liangyu and Chu, Xiaojie and Zhang, Xiangyu and Sun, Jian},
+     journal={arXiv preprint arXiv:2204.04676},
+     year={2022}
+ }
+ '''
+
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn import init as init
+ from torch.nn.modules.batchnorm import _BatchNorm
+ from models.nafnet_utils import Local_Base, LayerNorm2d
+
+
+ class SimpleGate(nn.Module):
+     def forward(self, x):
+         x1, x2 = x.chunk(2, dim=1)
+         return x1 * x2
+
+ class NAFBlock(nn.Module):
+     def __init__(self, c, DW_Expand=2, FFN_Expand=2, drop_out_rate=0.):
+         super().__init__()
+         dw_channel = c * DW_Expand
+         self.conv1 = nn.Conv2d(in_channels=c, out_channels=dw_channel, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
+         self.conv2 = nn.Conv2d(in_channels=dw_channel, out_channels=dw_channel, kernel_size=3, padding=1, stride=1, groups=dw_channel,
+                                bias=True)
+         self.conv3 = nn.Conv2d(in_channels=dw_channel // 2, out_channels=c, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
+
+         # Simplified Channel Attention
+         self.sca = nn.Sequential(
+             nn.AdaptiveAvgPool2d(1),
+             nn.Conv2d(in_channels=dw_channel // 2, out_channels=dw_channel // 2, kernel_size=1, padding=0, stride=1,
+                       groups=1, bias=True),
+         )
+
+         # SimpleGate
+         self.sg = SimpleGate()
+
+         ffn_channel = FFN_Expand * c
+         self.conv4 = nn.Conv2d(in_channels=c, out_channels=ffn_channel, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
+         self.conv5 = nn.Conv2d(in_channels=ffn_channel // 2, out_channels=c, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
+
+         self.norm1 = LayerNorm2d(c)
+         self.norm2 = LayerNorm2d(c)
+
+         self.dropout1 = nn.Dropout(drop_out_rate) if drop_out_rate > 0. else nn.Identity()
+         self.dropout2 = nn.Dropout(drop_out_rate) if drop_out_rate > 0. else nn.Identity()
+
+         self.beta = nn.Parameter(torch.zeros((1, c, 1, 1)), requires_grad=True)
+         self.gamma = nn.Parameter(torch.zeros((1, c, 1, 1)), requires_grad=True)
+
+     def forward(self, inp):
+         x = inp
+
+         x = self.norm1(x)
+
+         x = self.conv1(x)
+         x = self.conv2(x)
+         x = self.sg(x)
+         x = x * self.sca(x)
+         x = self.conv3(x)
+
+         x = self.dropout1(x)
+
+         y = inp + x * self.beta
+
+         x = self.conv4(self.norm2(y))
+         x = self.sg(x)
+         x = self.conv5(x)
+
+         x = self.dropout2(x)
+
+         return y + x * self.gamma
+
+
+ class NAFNet(nn.Module):
+
+     def __init__(self, img_channel=3, width=16, middle_blk_num=1, enc_blk_nums=[], dec_blk_nums=[]):
+         super().__init__()
+
+         self.intro = nn.Conv2d(in_channels=img_channel, out_channels=width, kernel_size=3, padding=1, stride=1, groups=1,
+                                bias=True)
+         self.ending = nn.Conv2d(in_channels=width, out_channels=img_channel, kernel_size=3, padding=1, stride=1, groups=1,
+                                 bias=True)
+
+         self.encoders = nn.ModuleList()
+         self.decoders = nn.ModuleList()
+         self.middle_blks = nn.ModuleList()
+         self.ups = nn.ModuleList()
+         self.downs = nn.ModuleList()
+
+         chan = width
+         for num in enc_blk_nums:
+             self.encoders.append(
+                 nn.Sequential(
+                     *[NAFBlock(chan) for _ in range(num)]
+                 )
+             )
+             self.downs.append(
+                 nn.Conv2d(chan, 2*chan, 2, 2)
+             )
+             chan = chan * 2
+
+         self.middle_blks = \
+             nn.Sequential(
+                 *[NAFBlock(chan) for _ in range(middle_blk_num)]
+             )
+
+         for num in dec_blk_nums:
+             self.ups.append(
+                 nn.Sequential(
+                     nn.Conv2d(chan, chan * 2, 1, bias=False),
+                     nn.PixelShuffle(2)
+                 )
+             )
+             chan = chan // 2
+             self.decoders.append(
+                 nn.Sequential(
+                     *[NAFBlock(chan) for _ in range(num)]
+                 )
+             )
+
+         self.padder_size = 2 ** len(self.encoders)
+
+     def forward(self, inp):
+         B, C, H, W = inp.shape
+         inp = self.check_image_size(inp)
+
+         x = self.intro(inp)
+
+         encs = []
+
+         for encoder, down in zip(self.encoders, self.downs):
+             x = encoder(x)
+             encs.append(x)
+             x = down(x)
+
+         x = self.middle_blks(x)
+
+         for decoder, up, enc_skip in zip(self.decoders, self.ups, encs[::-1]):
+             x = up(x)
+             x = x + enc_skip
+             x = decoder(x)
+
+         x = self.ending(x)
+         x = x + inp
+
+         return x[:, :, :H, :W]
+
+     def check_image_size(self, x):
+         _, _, h, w = x.size()
+         mod_pad_h = (self.padder_size - h % self.padder_size) % self.padder_size
+         mod_pad_w = (self.padder_size - w % self.padder_size) % self.padder_size
+         x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h))
+         return x
+
+ class NAFNetLocal(Local_Base, NAFNet):
+     def __init__(self, *args, train_size=(1, 3, 256, 256), fast_imp=False, **kwargs):
+         Local_Base.__init__(self)
+         NAFNet.__init__(self, *args, **kwargs)
+
+         N, C, H, W = train_size
+         base_size = (int(H * 1.5), int(W * 1.5))
+
+         self.eval()
+         with torch.no_grad():
+             self.convert(base_size=base_size, train_size=train_size, fast_imp=fast_imp)
+
+
+ def create_nafnet(input_channels=3, width=32, enc_blks=[2, 2, 4, 8], middle_blk_num=12, dec_blks=[2, 2, 2, 2]):
+     """
+     Create NAFNet model
+     https://github.com/megvii-research/NAFNet/blob/main/options/test/SIDD/NAFNet-width32.yml
+     """
+
+     net = NAFNet(img_channel=input_channels, width=width, middle_blk_num=middle_blk_num,
+                  enc_blk_nums=enc_blks, dec_blk_nums=dec_blks)
+
+     # inp_shape = (3, 256, 256)
+
+     # from ptflops import get_model_complexity_info
+
+     # macs, params = get_model_complexity_info(net, inp_shape, verbose=False, print_per_layer_stat=False)
+
+     # params = float(params[:-3])
+     # macs = float(macs[:-4])
+
+     # print(macs, params)
+
+     return net
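
`SimpleGate` splits the channel dimension in half and multiplies the two halves, which is why `conv2` works on `2*c` channels while the block as a whole preserves the input shape. A short sanity check, assuming it is run from the repo root:

```python
# Minimal sketch (assumption: run from the repo root).
import torch
from models.nafnet import SimpleGate, NAFBlock

x = torch.rand(1, 64, 32, 32)
print(SimpleGate()(x).shape)   # torch.Size([1, 32, 32, 32]) — channels halved

block = NAFBlock(c=64)
with torch.no_grad():
    y = block(torch.rand(1, 64, 32, 32))
print(y.shape)                 # torch.Size([1, 64, 32, 32]) — shape preserved
```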
models/nafnet_utils.py ADDED
@@ -0,0 +1,146 @@
+ # ------------------------------------------------------------------------
+ # Copyright (c) 2022 megvii-model. All Rights Reserved.
+ # ------------------------------------------------------------------------
+ # Source: https://github.com/megvii-research/NAFNet
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import math
+
+ class LayerNormFunction(torch.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, x, weight, bias, eps):
+         ctx.eps = eps
+         N, C, H, W = x.size()
+         mu = x.mean(1, keepdim=True)
+         var = (x - mu).pow(2).mean(1, keepdim=True)
+         y = (x - mu) / (var + eps).sqrt()
+         ctx.save_for_backward(y, var, weight)
+         y = weight.view(1, C, 1, 1) * y + bias.view(1, C, 1, 1)
+         return y
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         eps = ctx.eps
+
+         N, C, H, W = grad_output.size()
+         y, var, weight = ctx.saved_variables
+         g = grad_output * weight.view(1, C, 1, 1)
+         mean_g = g.mean(dim=1, keepdim=True)
+
+         mean_gy = (g * y).mean(dim=1, keepdim=True)
+         gx = 1. / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g)
+         return gx, (grad_output * y).sum(dim=3).sum(dim=2).sum(dim=0), grad_output.sum(dim=3).sum(dim=2).sum(
+             dim=0), None
+
+ class LayerNorm2d(nn.Module):
+
+     def __init__(self, channels, eps=1e-6):
+         super(LayerNorm2d, self).__init__()
+         self.register_parameter('weight', nn.Parameter(torch.ones(channels)))
+         self.register_parameter('bias', nn.Parameter(torch.zeros(channels)))
+         self.eps = eps
+
+     def forward(self, x):
+         return LayerNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+
+
+ class AvgPool2d(nn.Module):
+     def __init__(self, kernel_size=None, base_size=None, auto_pad=True, fast_imp=False, train_size=None):
+         super().__init__()
+         self.kernel_size = kernel_size
+         self.base_size = base_size
+         self.auto_pad = auto_pad
+
+         # only used for fast implementation
+         self.fast_imp = fast_imp
+         self.rs = [5, 4, 3, 2, 1]
+         self.max_r1 = self.rs[0]
+         self.max_r2 = self.rs[0]
+         self.train_size = train_size
+
+     def extra_repr(self) -> str:
+         return 'kernel_size={}, base_size={}, stride={}, fast_imp={}'.format(
+             self.kernel_size, self.base_size, self.kernel_size, self.fast_imp
+         )
+
+     def forward(self, x):
+         if self.kernel_size is None and self.base_size:
+             train_size = self.train_size
+             if isinstance(self.base_size, int):
+                 self.base_size = (self.base_size, self.base_size)
+             self.kernel_size = list(self.base_size)
+             self.kernel_size[0] = x.shape[2] * self.base_size[0] // train_size[-2]
+             self.kernel_size[1] = x.shape[3] * self.base_size[1] // train_size[-1]
+
+             # only used for fast implementation
+             self.max_r1 = max(1, self.rs[0] * x.shape[2] // train_size[-2])
+             self.max_r2 = max(1, self.rs[0] * x.shape[3] // train_size[-1])
+
+         if self.kernel_size[0] >= x.size(-2) and self.kernel_size[1] >= x.size(-1):
+             return F.adaptive_avg_pool2d(x, 1)
+
+         if self.fast_imp:  # Non-equivalent implementation but faster
+             h, w = x.shape[2:]
+             if self.kernel_size[0] >= h and self.kernel_size[1] >= w:
+                 out = F.adaptive_avg_pool2d(x, 1)
+             else:
+                 r1 = [r for r in self.rs if h % r == 0][0]
+                 r2 = [r for r in self.rs if w % r == 0][0]
+                 # reduction_constraint
+                 r1 = min(self.max_r1, r1)
+                 r2 = min(self.max_r2, r2)
+                 s = x[:, :, ::r1, ::r2].cumsum(dim=-1).cumsum(dim=-2)
+                 n, c, h, w = s.shape
+                 k1, k2 = min(h - 1, self.kernel_size[0] // r1), min(w - 1, self.kernel_size[1] // r2)
+                 out = (s[:, :, :-k1, :-k2] - s[:, :, :-k1, k2:] - s[:, :, k1:, :-k2] + s[:, :, k1:, k2:]) / (k1 * k2)
+                 out = torch.nn.functional.interpolate(out, scale_factor=(r1, r2))
+         else:
+             n, c, h, w = x.shape
+             s = x.cumsum(dim=-1).cumsum_(dim=-2)
+             s = torch.nn.functional.pad(s, (1, 0, 1, 0))  # pad 0 for convenience
+             k1, k2 = min(h, self.kernel_size[0]), min(w, self.kernel_size[1])
+             s1, s2, s3, s4 = s[:, :, :-k1, :-k2], s[:, :, :-k1, k2:], s[:, :, k1:, :-k2], s[:, :, k1:, k2:]
+             out = s4 + s1 - s2 - s3
+             out = out / (k1 * k2)
+
+         if self.auto_pad:
+             n, c, h, w = x.shape
+             _h, _w = out.shape[2:]
+             # print(x.shape, self.kernel_size)
+             pad2d = ((w - _w) // 2, (w - _w + 1) // 2, (h - _h) // 2, (h - _h + 1) // 2)
+             out = torch.nn.functional.pad(out, pad2d, mode='replicate')
+
+         return out
+
+ def replace_layers(model, base_size, train_size, fast_imp, **kwargs):
+     for n, m in model.named_children():
+         if len(list(m.children())) > 0:
+             ## compound module, go inside it
+             replace_layers(m, base_size, train_size, fast_imp, **kwargs)
+
+         if isinstance(m, nn.AdaptiveAvgPool2d):
+             pool = AvgPool2d(base_size=base_size, fast_imp=fast_imp, train_size=train_size)
+             assert m.output_size == 1
+             setattr(model, n, pool)
+
+
+ '''
+ ref.
+ @article{chu2021tlsc,
+     title={Revisiting Global Statistics Aggregation for Improving Image Restoration},
+     author={Chu, Xiaojie and Chen, Liangyu and Chen, Chengpeng and Lu, Xin},
+     journal={arXiv preprint arXiv:2112.04491},
+     year={2021}
+ }
+ '''
+ class Local_Base():
+     def convert(self, *args, train_size, **kwargs):
+         replace_layers(self, *args, train_size=train_size, **kwargs)
+         imgs = torch.rand(train_size)
+         with torch.no_grad():
+             self.forward(imgs)
requirements_gradio.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ transformers
+ Pillow>=6.2.2
+ sentence-transformers==2.3.0
+ gradio==4.16.0
+ #gradio_imageslider==0.0.18
text/models.py ADDED
@@ -0,0 +1,65 @@
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from transformers import DistilBertModel, DistilBertTokenizer, AutoModel, AutoTokenizer
+ import os
+
+ # Models that use mean pooling
+ POOL_MODELS = {"sentence-transformers/all-MiniLM-L6-v2", "TaylorAI/bge-micro-v2"}
+
+ # Mean Pooling - Take attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ class LanguageModel(nn.Module):
+     def __init__(self, model='distilbert-base-uncased'):
+         super(LanguageModel, self).__init__()
+
+         self.tokenizer = AutoTokenizer.from_pretrained(model)
+         self.model = AutoModel.from_pretrained(model)
+         self.model_name = model
+         # Remove the CLIP vision tower
+         if "clip" in self.model_name:
+             self.model.vision_model = None
+         # Freeze the pre-trained parameters (very important)
+         for param in self.model.parameters():
+             param.requires_grad = False
+
+         # Make sure to set evaluation mode (also important)
+         self.model.eval()
+
+     def forward(self, text_batch):
+         inputs = self.tokenizer(text_batch, padding=True, truncation=True, return_tensors="pt")
+         with torch.no_grad():  # Ensure no gradients are computed for this forward pass
+
+             if "clip" in self.model_name:
+                 sentence_embedding = self.model.get_text_features(**inputs)
+                 return sentence_embedding
+
+             outputs = self.model(**inputs)
+
+             if any(model in self.model_name for model in POOL_MODELS):
+                 sentence_embeddings = mean_pooling(outputs, inputs['attention_mask'])
+                 # Normalize embeddings
+                 sentence_embedding = F.normalize(sentence_embeddings, p=2, dim=1)
+             else:
+                 sentence_embedding = outputs.last_hidden_state[:, 0, :]
+         return sentence_embedding
+
+
+ class LMHead(nn.Module):
+     def __init__(self, embedding_dim=384, hidden_dim=256, num_classes=4):
+         super(LMHead, self).__init__()
+
+         self.fc1 = nn.Linear(embedding_dim, hidden_dim)
+         #self.gelu = nn.GELU()
+         self.fc2 = nn.Linear(hidden_dim, num_classes)
+
+     def forward(self, x):
+         embd = self.fc1(x)
+         embd = F.normalize(embd, p=2, dim=1)
+         deg_pred = self.fc2(embd)
+         return embd, deg_pred
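
Taken together, `LanguageModel` produces a frozen sentence embedding for the instruction and `LMHead` projects it into the 256-dim conditioning vector plus degradation-class logits. A minimal sketch of that flow, assuming internet access to download the sentence-embedding weights; note the `LMHead` below is randomly initialised, whereas `app.py` loads the trained weights from `models/lm_instructir-7d.pt`:

```python
import torch
from text.models import LanguageModel, LMHead

language_model = LanguageModel(model="TaylorAI/bge-micro-v2")
lm_head = LMHead(embedding_dim=384, hidden_dim=256, num_classes=7)

with torch.no_grad():
    lm_embd = language_model("my photo is too dark, can you fix it?")
    text_embd, deg_pred = lm_head(lm_embd)

print(lm_embd.shape)    # torch.Size([1, 384]) sentence embedding
print(text_embd.shape)  # torch.Size([1, 256]) conditioning vector for the ICBs
print(deg_pred.shape)   # torch.Size([1, 7])   degradation-class logits
```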
text/sample_prompts.json ADDED
@@ -0,0 +1,55 @@
+ {
+     "denoising": [
+         "Help me reduce the fuzziness in this image.",
+         "I need this image denoised ASAP.",
+         "Clean up this noisy image, it's an eyesore.",
+         "Can you clean the dots from my image?",
+         "Help me with my picture, it's full of tiny spots.",
+         "Clean up this image, it's all grainy."
+     ],
+     "deblurring": [
+         "Please, clean up this blurry photo.",
+         "My picture's not sharp, fix it.",
+         "Deblur my picture, it's too fuzzy.",
+         "Help, my photo is too blurry.",
+         "Please, make my image less smudgy."
+     ],
+     "dehazing": [
+         "Please, fix the haziness in my image.",
+         "I need to remove the haziness from this image.",
+         "Get rid of the fog in my image.",
+         "Fix my photo, it's too misty.",
+         "Help me, my photo is all hazy."
+     ],
+     "deraining": [
+         "I want to eliminate the water from this image.",
+         "Clear the rain from my picture.",
+         "I need to clear the rain from this image.",
+         "Can you get rid of the raindrops in my picture?"
+     ],
+     "sr": [
+         "I need to enhance the size and quality of this image.",
+         "My photo is lacking size and clarity; can you improve it?",
+         "I'd appreciate it if you could upscale this photo.",
+         "My picture is too little, enlarge it."
+     ],
+     "ambiguous": [
+         "Please, clear up the mess on this image.",
+         "I want this image to look good.",
+         "make it pop",
+         "Fix my photo, it's all messed up."
+     ],
+     "lol": [
+         "I took this photo during night, enhance it",
+         "The photo is too dark, improve exposure",
+         "my image has poor lighting conditions, can you fix it?",
+         "Can you make the image brighter?"
+     ],
+     "enhancement": [
+         "make my image look like DSLR",
+         "improve the colors of my image",
+         "enhance the colors of the image",
+         "Can you edit this to look like an award-winning photo?",
+         "I want the picture to be retouched for a professional portfolio."
+     ]
+ }
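
The JSON groups example instructions per restoration task. A minimal sketch (an assumption, not part of the commit) of sampling one prompt per task, e.g. to feed the Gradio demo or a quick test loop:

```python
# Pick one instruction per degradation type from text/sample_prompts.json.
import json
import random

with open("text/sample_prompts.json", "r") as f:
    prompts = json.load(f)

for task, options in prompts.items():
    print(f"{task}: {random.choice(options)}")
```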