haoning.wu committed
Commit f8ea2c9
1 Parent(s): bf92928

Add InstructIR plugin!

app.py CHANGED
@@ -1,10 +1,85 @@
+import os, yaml
 import gradio as gr
 import requests
+import argparse
+
 from PIL import Image
 
+import numpy as np
 import torch
 from transformers import AutoModelForCausalLM
 
+from huggingface_hub import hf_hub_download
+
+
+## InstructIR Plugin ##
+from insir_models import instructir
+from insir_text.models import LanguageModel, LMHead
+
+hf_hub_download(repo_id="marcosv/InstructIR", filename="im_instructir-7d.pt", local_dir="./")
+hf_hub_download(repo_id="marcosv/InstructIR", filename="lm_instructir-7d.pt", local_dir="./")
+
+CONFIG = "eval5d.yml"
+LM_MODEL = "lm_instructir-7d.pt"
+MODEL_NAME = "im_instructir-7d.pt"
+
+def dict2namespace(config):
+    namespace = argparse.Namespace()
+    for key, value in config.items():
+        if isinstance(value, dict):
+            new_value = dict2namespace(value)
+        else:
+            new_value = value
+        setattr(namespace, key, new_value)
+    return namespace
+
+
+# parse config file
+with open(os.path.join(CONFIG), "r") as f:
+    config = yaml.safe_load(f)
+
+cfg = dict2namespace(config)
+
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ir_model = instructir.create_model(input_channels=cfg.model.in_ch, width=cfg.model.width, enc_blks=cfg.model.enc_blks,
+                                   middle_blk_num=cfg.model.middle_blk_num, dec_blks=cfg.model.dec_blks, txtdim=cfg.model.textdim)
+ir_model = ir_model.to(device)
+print("IMAGE MODEL CKPT:", MODEL_NAME)
+ir_model.load_state_dict(torch.load(MODEL_NAME, map_location="cpu"), strict=True)
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+LMODEL = cfg.llm.model
+language_model = LanguageModel(model=LMODEL)
+lm_head = LMHead(embedding_dim=cfg.llm.model_dim, hidden_dim=cfg.llm.embd_dim, num_classes=cfg.llm.nclasses)
+lm_head = lm_head.to(device)
+
+print("LMHEAD MODEL CKPT:", LM_MODEL)
+lm_head.load_state_dict(torch.load(LM_MODEL, map_location="cpu"), strict=True)
+
+def process_img(image, prompt=None):
+    if prompt is None:
+        prompt = chat("How to improve the quality of the image?", [], image, None, None, None)
+        prompt += "Please help me improve its quality!"
+    print(prompt)
+    img = np.array(image)
+    img = img / 255.
+    img = img.astype(np.float32)
+    y = torch.tensor(img).permute(2, 0, 1).unsqueeze(0).to(device)
+
+    lm_embd = language_model(prompt)
+    lm_embd = lm_embd.to(device)
+
+    with torch.no_grad():
+        text_embd, deg_pred = lm_head(lm_embd)
+        x_hat = ir_model(y, text_embd)
+
+    restored_img = x_hat.squeeze().permute(1, 2, 0).clamp_(0, 1).cpu().detach().numpy()
+    restored_img = np.clip(restored_img, 0., 1.)
+
+    restored_img = (restored_img * 255.0).round().astype(np.uint8)  # float32 to uint8
+    return Image.fromarray(restored_img)  # (image, Image.fromarray(restored_img))
+
+## InstructIR Plugin ##
 model = AutoModelForCausalLM.from_pretrained("q-future/co-instruct-preview",
                                              trust_remote_code=True,
                                              torch_dtype=torch.float16,
@@ -15,7 +90,7 @@ def chat(message, history, image_1, image_2, image_3, image_4):
     print(history)
     if history:
         if image_1 is not None and image_2 is None:
-            past_message = "USER: The image: <|image|> " + history[0][0] + " ASSISTANT:" + history[0][1]
+            past_message = "USER: The input image: <|image|>" + history[0][0] + " ASSISTANT:" + history[0][1]
             for i in range((len(history) - 1)):
                 past_message += "USER:" + history[i][0] + " ASSISTANT:" + history[i][1] + "</s>"
             message = past_message + "USER:" + message + " ASSISTANT:"
@@ -42,7 +117,7 @@ def chat(message, history, image_1, image_2, image_3, image_4):
             images = [image_1, image_2, image_3, image_4]
     else:
         if image_1 is not None and image_2 is None:
-            message = "USER: The image: <|image|> " + message + " ASSISTANT:"
+            message = "USER: The input image: <|image|>" + message + " ASSISTANT:"
            images = [image_1]
        if image_1 is not None and image_2 is not None:
            if image_3 is None:
@@ -58,14 +133,24 @@ def chat(message, history, image_1, image_2, image_3, image_4):
 
     print(message)
 
-    return model.tokenizer.batch_decode(model.chat(message, images, max_new_tokens=300).clamp(0, 100000))[0].split("ASSISTANT:")[-1]
+    return model.tokenizer.batch_decode(model.chat(message, images, max_new_tokens=600).clamp(0, 100000))[0].split("ASSISTANT:")[-1]
+
+#### Image,Prompts examples
+examples = [
+    ["Which part of the image is relatively clearer, the upper part or the lower part? Please analyze in details.", Image.open("examples/sausage.jpg"), None],
+    ["Which image is noisy, and which one is with motion blur? Please analyze in details.", Image.open("examples/211.jpg"), Image.open("examples/frog.png")],
+    ["What is the problem in this image, and how to fix it? Please answer my questions one by one.", Image.open("examples/lol_748.png"), None],
+]
+
 
 
+title = "Q-Instruct🧑‍🏫"
 with gr.Blocks(title="img") as demo:
     title_markdown = ("""
-    <h3 align="center">*Super Version of Q-Instruct with Multi-image (up to 4, same as GPT-4V) Support!*</h3>
+
     <h1 align="center"><a href="https://github.com/Q-Future/Q-Instruct"><img src="https://github.com/Q-Future/Q-Instruct/blob/main/q_instruct_logo.png?raw=true", alt="Q-Instruct (mPLUG-Owl-2)" border="0" style="margin: 0 auto; height: 85px;" /></a> </h1>
     <h2 align="center">Q-Instruct: Improving Low-level Visual Abilities for Multi-modality Foundation Models</h2>
+    <div align="center">Super Version of Q-Instruct with Multi-image (up to 4, same as GPT-4V) Support! We also support <a href='https://huggingface.co/marcosv/InstructIR'>InstructIR</a> as PLUGIN!</div>
     <h5 align="center"> Please find our more accurate visual scoring demo on <a href='https://huggingface.co/spaces/teowu/OneScorer'>[OneScorer]</a>!</h5>
     <div align="center">
         <div style="display:flex; gap: 0.25rem;" align="center">
@@ -81,5 +166,16 @@ with gr.Blocks(title="img") as demo:
         input_img_2 = gr.Image(type='pil', label="Image 2 (Second image)")
         input_img_3 = gr.Image(type='pil', label="Image 3 (Third image)")
         input_img_4 = gr.Image(type='pil', label="Image 4 (Fourth image)")
-    gr.ChatInterface(fn=chat, additional_inputs=[input_img_1, input_img_2, input_img_3, input_img_4])
+    with gr.Row():
+        with gr.Column(scale=2):
+            gr.ChatInterface(fn=chat, additional_inputs=[input_img_1, input_img_2, input_img_3, input_img_4], examples=examples)
+        with gr.Column(scale=1):
+            input_image_ir = gr.Image(type="pil", label="Image for Auto Restoration")
+            output_image_ir = gr.Image(type="pil", label="Output of Auto Restoration")
+            gr.Interface(
+                fn=process_img,
+                inputs=[input_image_ir],
+                outputs=[output_image_ir],
+                examples=[Image.open("examples/gopro.png"), Image.open("examples/noise50.png"), Image.open("examples/lol_748.png")],
+            )
 demo.launch(share=True)
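The restoration path this diff wires into the Space can also be exercised outside Gradio. Below is a minimal sketch (not part of the commit) that mirrors process_img() with a hand-written instruction instead of one generated by chat(); it assumes the two InstructIR checkpoints and eval5d.yml have already been fetched as in app.py.

# Sketch: the InstructIR restoration path from app.py, run standalone.
import numpy as np
import torch
from PIL import Image

from insir_models import instructir
from insir_text.models import LanguageModel, LMHead

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image model with the hyper-parameters from eval5d.yml.
ir_model = instructir.create_model(input_channels=3, width=32,
                                   enc_blks=[2, 2, 4, 8], middle_blk_num=4,
                                   dec_blks=[2, 2, 2, 2], txtdim=256).to(device)
ir_model.load_state_dict(torch.load("im_instructir-7d.pt", map_location="cpu"), strict=True)

# Frozen sentence encoder + trained projection head.
language_model = LanguageModel(model="TaylorAI/bge-micro-v2")
lm_head = LMHead(embedding_dim=384, hidden_dim=256, num_classes=7).to(device)
lm_head.load_state_dict(torch.load("lm_instructir-7d.pt", map_location="cpu"), strict=True)

image = Image.open("examples/lol_748.png").convert("RGB")
y = torch.from_numpy(np.array(image).astype(np.float32) / 255.).permute(2, 0, 1).unsqueeze(0).to(device)

with torch.no_grad():
    lm_embd = language_model("The photo is too dark, improve exposure").to(device)
    text_embd, _ = lm_head(lm_embd)
    x_hat = ir_model(y, text_embd)

out = (x_hat.squeeze().permute(1, 2, 0).clamp(0, 1).cpu().numpy() * 255.0).round().astype(np.uint8)
Image.fromarray(out).save("restored.png")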
eval5d.yml ADDED
@@ -0,0 +1,40 @@
+llm:
+  model: 'TaylorAI/bge-micro-v2'  # See Paper Sec. 3.2 and Appendix
+  model_dim: 384
+  embd_dim: 256
+  nclasses: 7  # noise, blur, rain, haze, lol, enhancement, upsampling (Paper Sec. 4.3)
+  weights: False
+
+model:
+  arch: "instructir"
+  use_text: True
+  in_ch: 3
+  out_ch: 3
+  width: 32
+  enc_blks: [2, 2, 4, 8]
+  middle_blk_num: 4
+  dec_blks: [2, 2, 2, 2]
+  textdim: 256
+  weights: False
+
+test:
+  batch_size: 1
+  num_workers: 3
+
+  dn_datapath: "data/denoising_testsets/"
+  dn_datasets: ["CBSD68", "urban100", "Kodak24", "McMaster"]
+  dn_sigmas: [15, 25, 50]
+
+  rain_targets: ["data/Rain/rain_test/Rain100L/target/"]
+  rain_inputs: ["data/Rain/rain_test/Rain100L/input/"]
+
+  haze_targets: "data/SOTS-OUT/GT/"
+  haze_inputs: "data/SOTS-OUT/IN/"
+
+  lol_targets: "data/LOL/eval15/high/"
+  lol_inputs: "data/LOL/eval15/low/"
+
+  gopro_targets: "data/gopro_test/GoPro/target/"
+  gopro_inputs: "data/gopro_test/GoPro/input/"
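For context, app.py does not read these fields as a plain dict; it wraps the parsed YAML in a tree of argparse.Namespace objects via dict2namespace(). A small sketch of that access pattern, with values taken from the file above:

# Sketch: attribute-style access to eval5d.yml, as done in app.py.
import argparse
import yaml

def dict2namespace(config):
    namespace = argparse.Namespace()
    for key, value in config.items():
        setattr(namespace, key, dict2namespace(value) if isinstance(value, dict) else value)
    return namespace

with open("eval5d.yml") as f:
    cfg = dict2namespace(yaml.safe_load(f))

print(cfg.llm.model)       # 'TaylorAI/bge-micro-v2'
print(cfg.model.width)     # 32
print(cfg.model.enc_blks)  # [2, 2, 4, 8]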
examples/211.jpg CHANGED

Git LFS Details

  • SHA256: 7980c3c75b6eccd5519918344d03c6e8ba654f3faab2a4aae96e3baddd649a18
  • Pointer size: 130 Bytes
  • Size of remote file: 43.2 kB
examples/extreme_ironing.jpg CHANGED

Git LFS Details

  • SHA256: a54caa21bc513ed25c8ca7f5747555c05dfd4e33f6a3cf5c08b3d9138a4da1d9
  • Pointer size: 130 Bytes
  • Size of remote file: 62.6 kB
examples/frog.png ADDED

Git LFS Details

  • SHA256: 36adda1ff6c39824e480eb92583ca3e2ceea29d9cb206cca880781a102611b11
  • Pointer size: 131 Bytes
  • Size of remote file: 140 kB
examples/gopro.png ADDED

Git LFS Details

  • SHA256: 2b844eac02ac3499bea0dbccb382e8d4caea026ec6d2092d375e6d4c09f17b09
  • Pointer size: 131 Bytes
  • Size of remote file: 388 kB
examples/lol_748.png ADDED

Git LFS Details

  • SHA256: 325c720df5669e37b9f192bfa9a60add144b82e5e68d9f684c0010a0047b0056
  • Pointer size: 131 Bytes
  • Size of remote file: 335 kB
examples/noise50.png ADDED

Git LFS Details

  • SHA256: fa84462babeaafdebae7709f71fc048f415e2abeb4e263c69f908265923f3301
  • Pointer size: 131 Bytes
  • Size of remote file: 251 kB
examples/sausage.jpg CHANGED

Git LFS Details

  • SHA256: f5808fb71099077067cf92b3e4bbd8ddc4c179fa575091ff69dca9c96c175741
  • Pointer size: 131 Bytes
  • Size of remote file: 491 kB
insir_models/.ipynb_checkpoints/instructir-checkpoint.py ADDED
insir_models/.ipynb_checkpoints/nafnet-checkpoint.py ADDED
insir_models/.ipynb_checkpoints/nafnet_utils-checkpoint.py ADDED
insir_models/__pycache__/instructir.cpython-39.pyc ADDED
Binary file (4.22 kB)

insir_models/__pycache__/nafnet.cpython-39.pyc ADDED
Binary file (5.53 kB)

insir_models/__pycache__/nafnet_utils.cpython-39.pyc ADDED
Binary file (5.4 kB)
insir_models/instructir.py ADDED
@@ -0,0 +1,134 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import init as init
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from insir_models.nafnet_utils import Local_Base, LayerNorm2d
+from insir_models.nafnet import SimpleGate, NAFBlock
+
+
+class ICB(nn.Module):
+    """
+    Instruction Condition Block (ICB)
+    Paper Section 3.3
+    """
+    def __init__(self, feature_dim, text_dim=768):
+        super(ICB, self).__init__()
+        self.fc = nn.Linear(text_dim, feature_dim)
+        self.block = NAFBlock(feature_dim)
+        self.beta = nn.Parameter(torch.zeros((1, feature_dim, 1, 1)), requires_grad=True)
+        self.gamma = nn.Parameter(torch.zeros((1, feature_dim, 1, 1)), requires_grad=True)
+
+    def forward(self, x, text_embedding):
+        gating_factors = torch.sigmoid(self.fc(text_embedding))
+        gating_factors = gating_factors.unsqueeze(-1).unsqueeze(-1)
+
+        f = x * self.gamma + self.beta  # 1) learned feature scaling/modulation
+        f = f * gating_factors          # 2) (soft) feature routing based on text
+        f = self.block(f)               # 3) block feature enhancement
+        return f + x
+
+
+class InstructIR(nn.Module):
+    """
+    InstructIR model using NAFNet (ECCV 2022) as backbone.
+    The model takes as input an RGB image and a text embedding (encoded instruction).
+    Described in Paper Section 3.3
+    """
+    def __init__(self, img_channel=3, width=16, middle_blk_num=1, enc_blk_nums=[], dec_blk_nums=[], txtdim=768):
+        super().__init__()
+
+        self.intro = nn.Conv2d(in_channels=img_channel, out_channels=width, kernel_size=3, padding=1, stride=1, groups=1, bias=True)
+        self.ending = nn.Conv2d(in_channels=width, out_channels=img_channel, kernel_size=3, padding=1, stride=1, groups=1, bias=True)
+
+        self.encoders = nn.ModuleList()
+        self.decoders = nn.ModuleList()
+        self.middle_blks = nn.ModuleList()
+        self.ups = nn.ModuleList()
+        self.downs = nn.ModuleList()
+        self.enc_cond = nn.ModuleList()
+        self.dec_cond = nn.ModuleList()
+
+        chan = width
+        for num in enc_blk_nums:
+            self.encoders.append(
+                nn.Sequential(
+                    *[NAFBlock(chan) for _ in range(num)]
+                )
+            )
+            self.enc_cond.append(ICB(chan, txtdim))
+            self.downs.append(
+                nn.Conv2d(chan, 2*chan, 2, 2)
+            )
+            chan = chan * 2
+
+        self.middle_blks = nn.Sequential(
+            *[NAFBlock(chan) for _ in range(middle_blk_num)]
+        )
+
+        for num in dec_blk_nums:
+            self.ups.append(
+                nn.Sequential(
+                    nn.Conv2d(chan, chan * 2, 1, bias=False),
+                    nn.PixelShuffle(2)
+                )
+            )
+            chan = chan // 2
+            self.decoders.append(
+                nn.Sequential(
+                    *[NAFBlock(chan) for _ in range(num)]
+                )
+            )
+            # Add text embedding as modulation
+            self.dec_cond.append(ICB(chan, txtdim))
+
+        self.padder_size = 2 ** len(self.encoders)
+
+    def forward(self, inp, txtembd):
+        B, C, H, W = inp.shape
+        inp = self.check_image_size(inp)
+
+        x = self.intro(inp)
+        encs = []
+
+        for encoder, enc_mod, down in zip(self.encoders, self.enc_cond, self.downs):
+            x = encoder(x)
+            x = enc_mod(x, txtembd)
+            encs.append(x)
+            x = down(x)
+
+        x = self.middle_blks(x)
+
+        for decoder, up, enc_skip, dec_mod in zip(self.decoders, self.ups, encs[::-1], self.dec_cond):
+            x = up(x)
+            x = x + enc_skip
+            x = decoder(x)
+            x = dec_mod(x, txtembd)
+
+        x = self.ending(x)
+        x = x + inp
+
+        return x[:, :, :H, :W]
+
+    def check_image_size(self, x):
+        _, _, h, w = x.size()
+        mod_pad_h = (self.padder_size - h % self.padder_size) % self.padder_size
+        mod_pad_w = (self.padder_size - w % self.padder_size) % self.padder_size
+        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h))
+        return x
+
+
+def create_model(input_channels=3, width=32, enc_blks=[2, 2, 4, 8], middle_blk_num=12, dec_blks=[2, 2, 2, 2], txtdim=768):
+    net = InstructIR(img_channel=input_channels, width=width, middle_blk_num=middle_blk_num,
+                     enc_blk_nums=enc_blks, dec_blk_nums=dec_blks, txtdim=txtdim)
+    return net
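As a quick sanity check (not part of the commit), the model can be instantiated with the same hyper-parameters as eval5d.yml and run on dummy tensors; forward() pads the input to a multiple of 2**len(enc_blks) and crops the output back to the original size.

# Sketch: shape check of InstructIR with the eval5d.yml configuration.
import torch
from insir_models import instructir

net = instructir.create_model(input_channels=3, width=32,
                              enc_blks=[2, 2, 4, 8], middle_blk_num=4,
                              dec_blks=[2, 2, 2, 2], txtdim=256)

img = torch.rand(1, 3, 321, 481)  # arbitrary size; padded internally to a multiple of 16
txt = torch.rand(1, 256)          # stands in for the LMHead text embedding (dim == txtdim)
with torch.no_grad():
    out = net(img, txt)
print(out.shape)                  # torch.Size([1, 3, 321, 481]), cropped back to the input size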
insir_models/nafnet.py ADDED
@@ -0,0 +1,201 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Source: https://github.com/megvii-research/NAFNet
+
+'''
+Simple Baselines for Image Restoration
+
+@article{chen2022simple,
+  title={Simple Baselines for Image Restoration},
+  author={Chen, Liangyu and Chu, Xiaojie and Zhang, Xiangyu and Sun, Jian},
+  journal={arXiv preprint arXiv:2204.04676},
+  year={2022}
+}
+'''
+
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import init as init
+from torch.nn.modules.batchnorm import _BatchNorm
+from insir_models.nafnet_utils import Local_Base, LayerNorm2d
+
+
+class SimpleGate(nn.Module):
+    def forward(self, x):
+        x1, x2 = x.chunk(2, dim=1)
+        return x1 * x2
+
+
+class NAFBlock(nn.Module):
+    def __init__(self, c, DW_Expand=2, FFN_Expand=2, drop_out_rate=0.):
+        super().__init__()
+        dw_channel = c * DW_Expand
+        self.conv1 = nn.Conv2d(in_channels=c, out_channels=dw_channel, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
+        self.conv2 = nn.Conv2d(in_channels=dw_channel, out_channels=dw_channel, kernel_size=3, padding=1, stride=1, groups=dw_channel, bias=True)
+        self.conv3 = nn.Conv2d(in_channels=dw_channel // 2, out_channels=c, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
+
+        # Simplified Channel Attention
+        self.sca = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(in_channels=dw_channel // 2, out_channels=dw_channel // 2, kernel_size=1, padding=0, stride=1, groups=1, bias=True),
+        )
+
+        # SimpleGate
+        self.sg = SimpleGate()
+
+        ffn_channel = FFN_Expand * c
+        self.conv4 = nn.Conv2d(in_channels=c, out_channels=ffn_channel, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
+        self.conv5 = nn.Conv2d(in_channels=ffn_channel // 2, out_channels=c, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
+
+        self.norm1 = LayerNorm2d(c)
+        self.norm2 = LayerNorm2d(c)
+
+        self.dropout1 = nn.Dropout(drop_out_rate) if drop_out_rate > 0. else nn.Identity()
+        self.dropout2 = nn.Dropout(drop_out_rate) if drop_out_rate > 0. else nn.Identity()
+
+        self.beta = nn.Parameter(torch.zeros((1, c, 1, 1)), requires_grad=True)
+        self.gamma = nn.Parameter(torch.zeros((1, c, 1, 1)), requires_grad=True)
+
+    def forward(self, inp):
+        x = inp
+
+        x = self.norm1(x)
+
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = self.sg(x)
+        x = x * self.sca(x)
+        x = self.conv3(x)
+
+        x = self.dropout1(x)
+
+        y = inp + x * self.beta
+
+        x = self.conv4(self.norm2(y))
+        x = self.sg(x)
+        x = self.conv5(x)
+
+        x = self.dropout2(x)
+
+        return y + x * self.gamma
+
+
+class NAFNet(nn.Module):
+
+    def __init__(self, img_channel=3, width=16, middle_blk_num=1, enc_blk_nums=[], dec_blk_nums=[]):
+        super().__init__()
+
+        self.intro = nn.Conv2d(in_channels=img_channel, out_channels=width, kernel_size=3, padding=1, stride=1, groups=1, bias=True)
+        self.ending = nn.Conv2d(in_channels=width, out_channels=img_channel, kernel_size=3, padding=1, stride=1, groups=1, bias=True)
+
+        self.encoders = nn.ModuleList()
+        self.decoders = nn.ModuleList()
+        self.middle_blks = nn.ModuleList()
+        self.ups = nn.ModuleList()
+        self.downs = nn.ModuleList()
+
+        chan = width
+        for num in enc_blk_nums:
+            self.encoders.append(
+                nn.Sequential(
+                    *[NAFBlock(chan) for _ in range(num)]
+                )
+            )
+            self.downs.append(
+                nn.Conv2d(chan, 2*chan, 2, 2)
+            )
+            chan = chan * 2
+
+        self.middle_blks = \
+            nn.Sequential(
+                *[NAFBlock(chan) for _ in range(middle_blk_num)]
+            )
+
+        for num in dec_blk_nums:
+            self.ups.append(
+                nn.Sequential(
+                    nn.Conv2d(chan, chan * 2, 1, bias=False),
+                    nn.PixelShuffle(2)
+                )
+            )
+            chan = chan // 2
+            self.decoders.append(
+                nn.Sequential(
+                    *[NAFBlock(chan) for _ in range(num)]
+                )
+            )
+
+        self.padder_size = 2 ** len(self.encoders)
+
+    def forward(self, inp):
+        B, C, H, W = inp.shape
+        inp = self.check_image_size(inp)
+
+        x = self.intro(inp)
+
+        encs = []
+
+        for encoder, down in zip(self.encoders, self.downs):
+            x = encoder(x)
+            encs.append(x)
+            x = down(x)
+
+        x = self.middle_blks(x)
+
+        for decoder, up, enc_skip in zip(self.decoders, self.ups, encs[::-1]):
+            x = up(x)
+            x = x + enc_skip
+            x = decoder(x)
+
+        x = self.ending(x)
+        x = x + inp
+
+        return x[:, :, :H, :W]
+
+    def check_image_size(self, x):
+        _, _, h, w = x.size()
+        mod_pad_h = (self.padder_size - h % self.padder_size) % self.padder_size
+        mod_pad_w = (self.padder_size - w % self.padder_size) % self.padder_size
+        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h))
+        return x
+
+
+class NAFNetLocal(Local_Base, NAFNet):
+    def __init__(self, *args, train_size=(1, 3, 256, 256), fast_imp=False, **kwargs):
+        Local_Base.__init__(self)
+        NAFNet.__init__(self, *args, **kwargs)
+
+        N, C, H, W = train_size
+        base_size = (int(H * 1.5), int(W * 1.5))
+
+        self.eval()
+        with torch.no_grad():
+            self.convert(base_size=base_size, train_size=train_size, fast_imp=fast_imp)
+
+
+def create_nafnet(input_channels=3, width=32, enc_blks=[2, 2, 4, 8], middle_blk_num=12, dec_blks=[2, 2, 2, 2]):
+    """
+    Create NAFNet model
+    https://github.com/megvii-research/NAFNet/blob/main/options/test/SIDD/NAFNet-width32.yml
+    """
+    net = NAFNet(img_channel=input_channels, width=width, middle_blk_num=middle_blk_num,
+                 enc_blk_nums=enc_blks, dec_blk_nums=dec_blks)
+
+    # inp_shape = (3, 256, 256)
+    # from ptflops import get_model_complexity_info
+    # macs, params = get_model_complexity_info(net, inp_shape, verbose=False, print_per_layer_stat=False)
+    # params = float(params[:-3])
+    # macs = float(macs[:-4])
+    # print(macs, params)
+
+    return net
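The plain NAFNet baseline kept here has the same encoder/decoder interface as InstructIR, just without the text embedding. A brief shape check, assuming the default width-32 configuration from the NAFNet repository:

# Sketch: plain NAFNet forward pass (no instruction conditioning).
import torch
from insir_models.nafnet import create_nafnet

net = create_nafnet(input_channels=3, width=32,
                    enc_blks=[2, 2, 4, 8], middle_blk_num=12, dec_blks=[2, 2, 2, 2])
x = torch.rand(1, 3, 256, 256)
with torch.no_grad():
    y = net(x)
print(y.shape)  # torch.Size([1, 3, 256, 256])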
insir_models/nafnet_utils.py ADDED
@@ -0,0 +1,146 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Source: https://github.com/megvii-research/NAFNet
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+
+class LayerNormFunction(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, x, weight, bias, eps):
+        ctx.eps = eps
+        N, C, H, W = x.size()
+        mu = x.mean(1, keepdim=True)
+        var = (x - mu).pow(2).mean(1, keepdim=True)
+        y = (x - mu) / (var + eps).sqrt()
+        ctx.save_for_backward(y, var, weight)
+        y = weight.view(1, C, 1, 1) * y + bias.view(1, C, 1, 1)
+        return y
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        eps = ctx.eps
+
+        N, C, H, W = grad_output.size()
+        y, var, weight = ctx.saved_variables
+        g = grad_output * weight.view(1, C, 1, 1)
+        mean_g = g.mean(dim=1, keepdim=True)
+
+        mean_gy = (g * y).mean(dim=1, keepdim=True)
+        gx = 1. / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g)
+        return gx, (grad_output * y).sum(dim=3).sum(dim=2).sum(dim=0), grad_output.sum(dim=3).sum(dim=2).sum(dim=0), None
+
+
+class LayerNorm2d(nn.Module):
+
+    def __init__(self, channels, eps=1e-6):
+        super(LayerNorm2d, self).__init__()
+        self.register_parameter('weight', nn.Parameter(torch.ones(channels)))
+        self.register_parameter('bias', nn.Parameter(torch.zeros(channels)))
+        self.eps = eps
+
+    def forward(self, x):
+        return LayerNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+
+class AvgPool2d(nn.Module):
+    def __init__(self, kernel_size=None, base_size=None, auto_pad=True, fast_imp=False, train_size=None):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.base_size = base_size
+        self.auto_pad = auto_pad
+
+        # only used for fast implementation
+        self.fast_imp = fast_imp
+        self.rs = [5, 4, 3, 2, 1]
+        self.max_r1 = self.rs[0]
+        self.max_r2 = self.rs[0]
+        self.train_size = train_size
+
+    def extra_repr(self) -> str:
+        return 'kernel_size={}, base_size={}, stride={}, fast_imp={}'.format(
+            self.kernel_size, self.base_size, self.kernel_size, self.fast_imp
+        )
+
+    def forward(self, x):
+        if self.kernel_size is None and self.base_size:
+            train_size = self.train_size
+            if isinstance(self.base_size, int):
+                self.base_size = (self.base_size, self.base_size)
+            self.kernel_size = list(self.base_size)
+            self.kernel_size[0] = x.shape[2] * self.base_size[0] // train_size[-2]
+            self.kernel_size[1] = x.shape[3] * self.base_size[1] // train_size[-1]
+
+            # only used for fast implementation
+            self.max_r1 = max(1, self.rs[0] * x.shape[2] // train_size[-2])
+            self.max_r2 = max(1, self.rs[0] * x.shape[3] // train_size[-1])
+
+        if self.kernel_size[0] >= x.size(-2) and self.kernel_size[1] >= x.size(-1):
+            return F.adaptive_avg_pool2d(x, 1)
+
+        if self.fast_imp:  # Non-equivalent implementation but faster
+            h, w = x.shape[2:]
+            if self.kernel_size[0] >= h and self.kernel_size[1] >= w:
+                out = F.adaptive_avg_pool2d(x, 1)
+            else:
+                r1 = [r for r in self.rs if h % r == 0][0]
+                r2 = [r for r in self.rs if w % r == 0][0]
+                # reduction constraint
+                r1 = min(self.max_r1, r1)
+                r2 = min(self.max_r2, r2)
+                s = x[:, :, ::r1, ::r2].cumsum(dim=-1).cumsum(dim=-2)
+                n, c, h, w = s.shape
+                k1, k2 = min(h - 1, self.kernel_size[0] // r1), min(w - 1, self.kernel_size[1] // r2)
+                out = (s[:, :, :-k1, :-k2] - s[:, :, :-k1, k2:] - s[:, :, k1:, :-k2] + s[:, :, k1:, k2:]) / (k1 * k2)
+                out = torch.nn.functional.interpolate(out, scale_factor=(r1, r2))
+        else:
+            n, c, h, w = x.shape
+            s = x.cumsum(dim=-1).cumsum_(dim=-2)
+            s = torch.nn.functional.pad(s, (1, 0, 1, 0))  # pad 0 for convenience
+            k1, k2 = min(h, self.kernel_size[0]), min(w, self.kernel_size[1])
+            s1, s2, s3, s4 = s[:, :, :-k1, :-k2], s[:, :, :-k1, k2:], s[:, :, k1:, :-k2], s[:, :, k1:, k2:]
+            out = s4 + s1 - s2 - s3
+            out = out / (k1 * k2)
+
+        if self.auto_pad:
+            n, c, h, w = x.shape
+            _h, _w = out.shape[2:]
+            # print(x.shape, self.kernel_size)
+            pad2d = ((w - _w) // 2, (w - _w + 1) // 2, (h - _h) // 2, (h - _h + 1) // 2)
+            out = torch.nn.functional.pad(out, pad2d, mode='replicate')
+
+        return out
+
+
+def replace_layers(model, base_size, train_size, fast_imp, **kwargs):
+    for n, m in model.named_children():
+        if len(list(m.children())) > 0:
+            ## compound module, go inside it
+            replace_layers(m, base_size, train_size, fast_imp, **kwargs)
+
+        if isinstance(m, nn.AdaptiveAvgPool2d):
+            pool = AvgPool2d(base_size=base_size, fast_imp=fast_imp, train_size=train_size)
+            assert m.output_size == 1
+            setattr(model, n, pool)
+
+
+'''
+ref.
+@article{chu2021tlsc,
+  title={Revisiting Global Statistics Aggregation for Improving Image Restoration},
+  author={Chu, Xiaojie and Chen, Liangyu and Chen, Chengpeng and Lu, Xin},
+  journal={arXiv preprint arXiv:2112.04491},
+  year={2021}
+}
+'''
+class Local_Base():
+    def convert(self, *args, train_size, **kwargs):
+        replace_layers(self, *args, train_size=train_size, **kwargs)
+        imgs = torch.rand(train_size)
+        with torch.no_grad():
+            self.forward(imgs)
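LayerNorm2d above normalises each spatial position over the channel dimension of an NCHW tensor via the custom autograd Function, then applies a per-channel affine weight and bias. A tiny check of that behaviour:

# Sketch: LayerNorm2d zero-centres the channel vector at every pixel.
import torch
from insir_models.nafnet_utils import LayerNorm2d

ln = LayerNorm2d(channels=32)
x = torch.randn(2, 32, 64, 64)
with torch.no_grad():
    y = ln(x)
print(y.shape)                    # torch.Size([2, 32, 64, 64])
print(y.mean(dim=1).abs().max())  # ~0: per-pixel channel mean after normalisation (bias is initialised to zero)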
insir_text/.ipynb_checkpoints/models-checkpoint.py ADDED
insir_text/__pycache__/models.cpython-39.pyc ADDED
Binary file (2.72 kB)
insir_text/models.py ADDED
@@ -0,0 +1,65 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+from transformers import DistilBertModel, DistilBertTokenizer, AutoModel, AutoTokenizer
+import os
+
+# Models that use mean pooling
+POOL_MODELS = {"sentence-transformers/all-MiniLM-L6-v2", "TaylorAI/bge-micro-v2"}
+
+# Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+class LanguageModel(nn.Module):
+    def __init__(self, model='distilbert-base-uncased'):
+        super(LanguageModel, self).__init__()
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.model = AutoModel.from_pretrained(model)
+        self.model_name = model
+        # Remove the CLIP vision tower
+        if "clip" in self.model_name:
+            self.model.vision_model = None
+        # Freeze the pre-trained parameters (very important)
+        for param in self.model.parameters():
+            param.requires_grad = False
+
+        # Make sure to set evaluation mode (also important)
+        self.model.eval()
+
+    def forward(self, text_batch):
+        inputs = self.tokenizer(text_batch, padding=True, truncation=True, return_tensors="pt")
+        with torch.no_grad():  # Ensure no gradients are computed for this forward pass
+
+            if "clip" in self.model_name:
+                sentence_embedding = self.model.get_text_features(**inputs)
+                return sentence_embedding
+
+            outputs = self.model(**inputs)
+
+            if any(model in self.model_name for model in POOL_MODELS):
+                sentence_embeddings = mean_pooling(outputs, inputs['attention_mask'])
+                # Normalize embeddings
+                sentence_embedding = F.normalize(sentence_embeddings, p=2, dim=1)
+            else:
+                sentence_embedding = outputs.last_hidden_state[:, 0, :]
+            return sentence_embedding
+
+
+class LMHead(nn.Module):
+    def __init__(self, embedding_dim=384, hidden_dim=256, num_classes=4):
+        super(LMHead, self).__init__()
+
+        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
+        # self.gelu = nn.GELU()
+        self.fc2 = nn.Linear(hidden_dim, num_classes)
+
+    def forward(self, x):
+        embd = self.fc1(x)
+        embd = F.normalize(embd, p=2, dim=1)
+        deg_pred = self.fc2(embd)
+        return embd, deg_pred
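Putting the two classes together: the frozen sentence encoder produces a pooled 384-d embedding (for TaylorAI/bge-micro-v2, per eval5d.yml), and LMHead projects it to the 256-d vector InstructIR consumes plus degradation-class logits. A short sketch with randomly initialised head weights (the app loads lm_instructir-7d.pt instead):

# Sketch: instruction -> sentence embedding -> (text_embd, degradation logits).
import torch
from insir_text.models import LanguageModel, LMHead

lm = LanguageModel(model="TaylorAI/bge-micro-v2")       # mean-pooled encoder, frozen
head = LMHead(embedding_dim=384, hidden_dim=256, num_classes=7)  # untrained here; app.py loads the checkpoint

with torch.no_grad():
    sent = lm(["Clean up this noisy image, it's an eyesore."])
    text_embd, deg_pred = head(sent)
print(sent.shape, text_embd.shape, deg_pred.shape)      # (1, 384) (1, 256) (1, 7)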
insir_text/sample_prompts.json ADDED
@@ -0,0 +1,55 @@
+{
+    "denoising": [
+        "Help me reduce the fuzziness in this image.",
+        "I need this image denoised ASAP.",
+        "Clean up this noisy image, it's an eyesore.",
+        "Can you clean the dots from my image?",
+        "Help me with my picture, it's full of tiny spots.",
+        "Clean up this image, it's all grainy."
+    ],
+    "deblurring": [
+        "Please, clean up this blurry photo.",
+        "My picture's not sharp, fix it.",
+        "Deblur my picture, it's too fuzzy.",
+        "Help, my photo is too blurry.",
+        "Please, make my image less smudgy."
+    ],
+    "dehazing": [
+        "Please, fix the haziness in my image.",
+        "I need to remove the haziness from this image.",
+        "Get rid of the fog in my image.",
+        "Fix my photo, it's too misty.",
+        "Help me, my photo is all hazy."
+    ],
+    "deraining": [
+        "I want to eliminate the water from this image.",
+        "Clear the rain from my picture.",
+        "I need to clear the rain from this image.",
+        "Can you get rid of the raindrops in my picture?"
+    ],
+    "sr": [
+        "I need to enhance the size and quality of this image.",
+        "My photo is lacking size and clarity; can you improve it?",
+        "I'd appreciate it if you could upscale this photo.",
+        "My picture is too little, enlarge it."
+    ],
+    "ambiguous": [
+        "Please, clear up the mess on this image.",
+        "I want this image to look good.",
+        "make it pop",
+        "Fix my photo, it's all messed up."
+    ],
+    "lol": [
+        "I took this photo during night, enhance it",
+        "The photo is too dark, improve exposure",
+        "my image has poor lighting conditions, can you fix it?",
+        "Can you make the image brighter?"
+    ],
+    "enhancement": [
+        "make my image look like DSLR",
+        "improve the colors of my image",
+        "enhance the colors of the image",
+        "Can you edit this to look like an award-winning photo?",
+        "I want the picture to be retouched for a professional portfolio."
+    ]
+}
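These prompts are grouped by degradation type; a small sketch of loading the file and sampling one instruction per task (path relative to the Space root):

# Sketch: pick a random instruction for a given degradation type.
import json
import random

with open("insir_text/sample_prompts.json") as f:
    prompts = json.load(f)

print(sorted(prompts.keys()))         # ['ambiguous', 'deblurring', 'dehazing', 'denoising', ...]
print(random.choice(prompts["lol"]))  # e.g. "The photo is too dark, improve exposure"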