Fabrice-TIERCELIN committed on
Commit
a6c349f
1 Parent(s): 2df5266

Upload 5 files

SUPIR/utils/colorfix.py ADDED
@@ -0,0 +1,120 @@
1
+ '''
2
+ # --------------------------------------------------------------------------------
3
+ # Color fixed script from Li Yi (https://github.com/pkuliyi2015/sd-webui-stablesr/blob/master/srmodule/colorfix.py)
4
+ # --------------------------------------------------------------------------------
5
+ '''
6
+
7
+ import torch
8
+ from PIL import Image
9
+ from torch import Tensor
10
+ from torch.nn import functional as F
11
+
12
+ from torchvision.transforms import ToTensor, ToPILImage
13
+
14
+ def adain_color_fix(target: Image, source: Image):
15
+ # Convert images to tensors
16
+ to_tensor = ToTensor()
17
+ target_tensor = to_tensor(target).unsqueeze(0)
18
+ source_tensor = to_tensor(source).unsqueeze(0)
19
+
20
+ # Apply adaptive instance normalization
21
+ result_tensor = adaptive_instance_normalization(target_tensor, source_tensor)
22
+
23
+ # Convert tensor back to image
24
+ to_image = ToPILImage()
25
+ result_image = to_image(result_tensor.squeeze(0).clamp_(0.0, 1.0))
26
+
27
+ return result_image
28
+
29
+ def wavelet_color_fix(target: Image, source: Image):
30
+ # Convert images to tensors
31
+ to_tensor = ToTensor()
32
+ target_tensor = to_tensor(target).unsqueeze(0)
33
+ source_tensor = to_tensor(source).unsqueeze(0)
34
+
35
+ # Apply wavelet reconstruction
36
+ result_tensor = wavelet_reconstruction(target_tensor, source_tensor)
37
+
38
+ # Convert tensor back to image
39
+ to_image = ToPILImage()
40
+ result_image = to_image(result_tensor.squeeze(0).clamp_(0.0, 1.0))
41
+
42
+ return result_image
43
+
44
+ def calc_mean_std(feat: Tensor, eps=1e-5):
45
+ """Calculate mean and std for adaptive_instance_normalization.
46
+ Args:
47
+ feat (Tensor): 4D tensor.
48
+ eps (float): A small value added to the variance to avoid
49
+ divide-by-zero. Default: 1e-5.
50
+ """
51
+ size = feat.size()
52
+ assert len(size) == 4, 'The input feature should be a 4D tensor.'
53
+ b, c = size[:2]
54
+ feat_var = feat.reshape(b, c, -1).var(dim=2) + eps
55
+ feat_std = feat_var.sqrt().reshape(b, c, 1, 1)
56
+ feat_mean = feat.reshape(b, c, -1).mean(dim=2).reshape(b, c, 1, 1)
57
+ return feat_mean, feat_std
58
+
59
+ def adaptive_instance_normalization(content_feat:Tensor, style_feat:Tensor):
60
+ """Adaptive instance normalization.
61
+ Adjust the reference features to have color and illumination similar
62
+ to those in the degraded features.
63
+ Args:
64
+ content_feat (Tensor): The reference feature.
65
+ style_feat (Tensor): The degraded features.
66
+ """
67
+ size = content_feat.size()
68
+ style_mean, style_std = calc_mean_std(style_feat)
69
+ content_mean, content_std = calc_mean_std(content_feat)
70
+ normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
71
+ return normalized_feat * style_std.expand(size) + style_mean.expand(size)
72
+
73
+ def wavelet_blur(image: Tensor, radius: int):
74
+ """
75
+ Apply wavelet blur to the input tensor.
76
+ """
77
+ # input shape: (1, 3, H, W)
78
+ # convolution kernel
79
+ kernel_vals = [
80
+ [0.0625, 0.125, 0.0625],
81
+ [0.125, 0.25, 0.125],
82
+ [0.0625, 0.125, 0.0625],
83
+ ]
84
+ kernel = torch.tensor(kernel_vals, dtype=image.dtype, device=image.device)
85
+ # add channel dimensions to the kernel to make it a 4D tensor
86
+ kernel = kernel[None, None]
87
+ # repeat the kernel across all input channels
88
+ kernel = kernel.repeat(3, 1, 1, 1)
89
+ image = F.pad(image, (radius, radius, radius, radius), mode='replicate')
90
+ # apply convolution
91
+ output = F.conv2d(image, kernel, groups=3, dilation=radius)
92
+ return output
93
+
94
+ def wavelet_decomposition(image: Tensor, levels=5):
95
+ """
96
+ Apply wavelet decomposition to the input tensor.
97
+ This function only returns the low frequency & the high frequency.
98
+ """
99
+ high_freq = torch.zeros_like(image)
100
+ for i in range(levels):
101
+ radius = 2 ** i
102
+ low_freq = wavelet_blur(image, radius)
103
+ high_freq += (image - low_freq)
104
+ image = low_freq
105
+
106
+ return high_freq, low_freq
107
+
108
+ def wavelet_reconstruction(content_feat:Tensor, style_feat:Tensor):
109
+ """
110
+ Apply wavelet decomposition to both inputs, so that the content keeps its detail but takes the color of the style.
111
+ """
112
+ # calculate the wavelet decomposition of the content feature
113
+ content_high_freq, content_low_freq = wavelet_decomposition(content_feat)
114
+ del content_low_freq
115
+ # calculate the wavelet decomposition of the style feature
116
+ style_high_freq, style_low_freq = wavelet_decomposition(style_feat)
117
+ del style_high_freq
118
+ # reconstruct the content feature with the style's high frequency
119
+ return content_high_freq + style_low_freq
120
+
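
For reference, a minimal usage sketch for the two entry points above (adain_color_fix and wavelet_color_fix). It assumes two RGB PIL images; the file names are placeholders, and for the wavelet variant both images must share the same resolution.

from PIL import Image

from SUPIR.utils.colorfix import adain_color_fix, wavelet_color_fix

# Placeholder file names: the restored model output and the original source image.
restored = Image.open("restored.png").convert("RGB")
source = Image.open("input.png").convert("RGB")

# Wavelet variant: keep the restored high frequencies, take the source's low-frequency color.
fixed = wavelet_color_fix(restored, source)
fixed.save("restored_wavelet_fix.png")

# AdaIN variant: match the per-channel mean/std of the source instead.
fixed = adain_color_fix(restored, source)
fixed.save("restored_adain_fix.png")
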
SUPIR/utils/devices.py ADDED
@@ -0,0 +1,138 @@
1
+ import sys
2
+ import contextlib
3
+ from functools import lru_cache
4
+
5
+ import torch
6
+ #from modules import errors
7
+
8
+ if sys.platform == "darwin":
9
+ from modules import mac_specific
10
+
11
+
12
+ def has_mps() -> bool:
13
+ if sys.platform != "darwin":
14
+ return False
15
+ else:
16
+ return mac_specific.has_mps
17
+
18
+
19
+ def get_cuda_device_string():
20
+ return "cuda"
21
+
22
+
23
+ def get_optimal_device_name():
24
+ if torch.cuda.is_available():
25
+ return get_cuda_device_string()
26
+
27
+ if has_mps():
28
+ return "mps"
29
+
30
+ return "cpu"
31
+
32
+
33
+ def get_optimal_device():
34
+ return torch.device(get_optimal_device_name())
35
+
36
+
37
+ def get_device_for(task):
38
+ return get_optimal_device()
39
+
40
+
41
+ def torch_gc():
42
+
43
+ if torch.cuda.is_available():
44
+ with torch.cuda.device(get_cuda_device_string()):
45
+ torch.cuda.empty_cache()
46
+ torch.cuda.ipc_collect()
47
+
48
+ if has_mps():
49
+ mac_specific.torch_mps_gc()
50
+
51
+
52
+ def enable_tf32():
53
+ if torch.cuda.is_available():
54
+
55
+ # enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
56
+ # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
57
+ if any(torch.cuda.get_device_capability(devid) == (7, 5) for devid in range(0, torch.cuda.device_count())):
58
+ torch.backends.cudnn.benchmark = True
59
+
60
+ torch.backends.cuda.matmul.allow_tf32 = True
61
+ torch.backends.cudnn.allow_tf32 = True
62
+
63
+
64
+ enable_tf32()
65
+ #errors.run(enable_tf32, "Enabling TF32")
66
+
67
+ cpu = torch.device("cpu")
68
+ device = device_interrogate = device_gfpgan = device_esrgan = device_codeformer = torch.device("cuda")
69
+ dtype = torch.float16
70
+ dtype_vae = torch.float16
71
+ dtype_unet = torch.float16
72
+ unet_needs_upcast = False
73
+
74
+
75
+ def cond_cast_unet(input):
76
+ return input.to(dtype_unet) if unet_needs_upcast else input
77
+
78
+
79
+ def cond_cast_float(input):
80
+ return input.float() if unet_needs_upcast else input
81
+
82
+
83
+ def randn(seed, shape):
84
+ torch.manual_seed(seed)
85
+ return torch.randn(shape, device=device)
86
+
87
+
88
+ def randn_without_seed(shape):
89
+ return torch.randn(shape, device=device)
90
+
91
+
92
+ def autocast(disable=False):
93
+ if disable:
94
+ return contextlib.nullcontext()
95
+
96
+ return torch.autocast("cuda")
97
+
98
+
99
+ def without_autocast(disable=False):
100
+ return torch.autocast("cuda", enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()
101
+
102
+
103
+ class NansException(Exception):
104
+ pass
105
+
106
+
107
+ def test_for_nans(x, where):
108
+ if not torch.all(torch.isnan(x)).item():
109
+ return
110
+
111
+ if where == "unet":
112
+ message = "A tensor with all NaNs was produced in Unet."
113
+
114
+ elif where == "vae":
115
+ message = "A tensor with all NaNs was produced in VAE."
116
+
117
+ else:
118
+ message = "A tensor with all NaNs was produced."
119
+
120
+ message += " Use --disable-nan-check commandline argument to disable this check."
121
+
122
+ raise NansException(message)
123
+
124
+
125
+ @lru_cache
126
+ def first_time_calculation():
127
+ """
128
+ just do any calculation with pytorch layers - the first time this is done it allocates about 700MB of memory and
129
+ spends about 2.7 seconds doing that, at least with NVIDIA.
130
+ """
131
+
132
+ x = torch.zeros((1, 1)).to(device, dtype)
133
+ linear = torch.nn.Linear(1, 1).to(device, dtype)
134
+ linear(x)
135
+
136
+ x = torch.zeros((1, 1, 3, 3)).to(device, dtype)
137
+ conv2d = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
138
+ conv2d(x)
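
As a quick orientation, a hedged sketch of how these helpers are meant to be combined by a caller. It assumes a CUDA machine, since device above is hard-coded to torch.device("cuda"); the shapes and the seed are arbitrary.

import torch

from SUPIR.utils import devices

devices.first_time_calculation()       # one-off warm-up, cached by lru_cache

with devices.autocast():               # torch.autocast("cuda") unless disabled
    x = devices.randn(42, (1, 4, 64, 64))    # seeded noise on devices.device
    y = torch.nn.functional.silu(x)
    devices.test_for_nans(y, "unet")         # raises NansException only if every value is NaN

devices.torch_gc()                     # free cached CUDA (or MPS) memory between jobs
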
SUPIR/utils/face_restoration_helper.py ADDED
@@ -0,0 +1,514 @@
1
+ import cv2
2
+ import numpy as np
3
+ import os
4
+ import torch
5
+ from torchvision.transforms.functional import normalize
6
+
7
+ from facexlib.detection import init_detection_model
8
+ from facexlib.parsing import init_parsing_model
9
+ from facexlib.utils.misc import img2tensor, imwrite
10
+
11
+ from .file import load_file_from_url
12
+
13
+
14
+ def get_largest_face(det_faces, h, w):
15
+ def get_location(val, length):
16
+ if val < 0:
17
+ return 0
18
+ elif val > length:
19
+ return length
20
+ else:
21
+ return val
22
+
23
+ face_areas = []
24
+ for det_face in det_faces:
25
+ left = get_location(det_face[0], w)
26
+ right = get_location(det_face[2], w)
27
+ top = get_location(det_face[1], h)
28
+ bottom = get_location(det_face[3], h)
29
+ face_area = (right - left) * (bottom - top)
30
+ face_areas.append(face_area)
31
+ largest_idx = face_areas.index(max(face_areas))
32
+ return det_faces[largest_idx], largest_idx
33
+
34
+
35
+ def get_center_face(det_faces, h=0, w=0, center=None):
36
+ if center is not None:
37
+ center = np.array(center)
38
+ else:
39
+ center = np.array([w / 2, h / 2])
40
+ center_dist = []
41
+ for det_face in det_faces:
42
+ face_center = np.array([(det_face[0] + det_face[2]) / 2, (det_face[1] + det_face[3]) / 2])
43
+ dist = np.linalg.norm(face_center - center)
44
+ center_dist.append(dist)
45
+ center_idx = center_dist.index(min(center_dist))
46
+ return det_faces[center_idx], center_idx
47
+
48
+
49
+ class FaceRestoreHelper(object):
50
+ """Helper for the face restoration pipeline (base class)."""
51
+
52
+ def __init__(self,
53
+ upscale_factor,
54
+ face_size=512,
55
+ crop_ratio=(1, 1),
56
+ det_model='retinaface_resnet50',
57
+ save_ext='png',
58
+ template_3points=False,
59
+ pad_blur=False,
60
+ use_parse=False,
61
+ device=None):
62
+ self.template_3points = template_3points # improve robustness
63
+ self.upscale_factor = int(upscale_factor)
64
+ # the cropped face ratio based on the square face
65
+ self.crop_ratio = crop_ratio # (h, w)
66
+ assert (self.crop_ratio[0] >= 1 and self.crop_ratio[1] >= 1), 'crop ratio only supports >=1'
67
+ self.face_size = (int(face_size * self.crop_ratio[1]), int(face_size * self.crop_ratio[0]))
68
+ self.det_model = det_model
69
+
70
+ if self.det_model == 'dlib':
71
+ # standard 5 landmarks for FFHQ faces with 1024 x 1024
72
+ self.face_template = np.array([[686.77227723, 488.62376238], [586.77227723, 493.59405941],
73
+ [337.91089109, 488.38613861], [437.95049505, 493.51485149],
74
+ [513.58415842, 678.5049505]])
75
+ self.face_template = self.face_template / (1024 // face_size)
76
+ elif self.template_3points:
77
+ self.face_template = np.array([[192, 240], [319, 240], [257, 371]])
78
+ else:
79
+ # standard 5 landmarks for FFHQ faces with 512 x 512
80
+ # facexlib
81
+ self.face_template = np.array([[192.98138, 239.94708], [318.90277, 240.1936], [256.63416, 314.01935],
82
+ [201.26117, 371.41043], [313.08905, 371.15118]])
83
+
84
+ # dlib: left_eye: 36:41 right_eye: 42:47 nose: 30,32,33,34 left mouth corner: 48 right mouth corner: 54
85
+ # self.face_template = np.array([[193.65928, 242.98541], [318.32558, 243.06108], [255.67984, 328.82894],
86
+ # [198.22603, 372.82502], [313.91018, 372.75659]])
87
+
88
+ self.face_template = self.face_template * (face_size / 512.0)
89
+ if self.crop_ratio[0] > 1:
90
+ self.face_template[:, 1] += face_size * (self.crop_ratio[0] - 1) / 2
91
+ if self.crop_ratio[1] > 1:
92
+ self.face_template[:, 0] += face_size * (self.crop_ratio[1] - 1) / 2
93
+ self.save_ext = save_ext
94
+ self.pad_blur = pad_blur
95
+ if self.pad_blur is True:
96
+ self.template_3points = False
97
+
98
+ self.all_landmarks_5 = []
99
+ self.det_faces = []
100
+ self.affine_matrices = []
101
+ self.inverse_affine_matrices = []
102
+ self.cropped_faces = []
103
+ self.restored_faces = []
104
+ self.pad_input_imgs = []
105
+
106
+ if device is None:
107
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
108
+ # self.device = get_device()
109
+ else:
110
+ self.device = device
111
+
112
+ # init face detection model
113
+ self.face_detector = init_detection_model(det_model, half=False, device=self.device)
114
+
115
+ # init face parsing model
116
+ self.use_parse = use_parse
117
+ self.face_parse = init_parsing_model(model_name='parsenet', device=self.device)
118
+
119
+ def set_upscale_factor(self, upscale_factor):
120
+ self.upscale_factor = upscale_factor
121
+
122
+ def read_image(self, img):
123
+ """img can be image path or cv2 loaded image."""
124
+ # self.input_img is Numpy array, (h, w, c), BGR, uint8, [0, 255]
125
+ if isinstance(img, str):
126
+ img = cv2.imread(img)
127
+
128
+ if np.max(img) > 256: # 16-bit image
129
+ img = img / 65535 * 255
130
+ if len(img.shape) == 2: # gray image
131
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
132
+ elif img.shape[2] == 4: # BGRA image with alpha channel
133
+ img = img[:, :, 0:3]
134
+
135
+ self.input_img = img
136
+ # self.is_gray = is_gray(img, threshold=10)
137
+ # if self.is_gray:
138
+ # print('Grayscale input: True')
139
+
140
+ if min(self.input_img.shape[:2]) < 512:
141
+ f = 512.0 / min(self.input_img.shape[:2])
142
+ self.input_img = cv2.resize(self.input_img, (0, 0), fx=f, fy=f, interpolation=cv2.INTER_LINEAR)
143
+
144
+ def init_dlib(self, detection_path, landmark5_path):
145
+ """Initialize the dlib detectors and predictors."""
146
+ try:
147
+ import dlib
148
+ except ImportError:
149
+ print('Please install dlib by running: conda install -c conda-forge dlib')
150
+ detection_path = load_file_from_url(url=detection_path, model_dir='weights/dlib', progress=True, file_name=None)
151
+ landmark5_path = load_file_from_url(url=landmark5_path, model_dir='weights/dlib', progress=True, file_name=None)
152
+ face_detector = dlib.cnn_face_detection_model_v1(detection_path)
153
+ shape_predictor_5 = dlib.shape_predictor(landmark5_path)
154
+ return face_detector, shape_predictor_5
155
+
156
+ def get_face_landmarks_5_dlib(self,
157
+ only_keep_largest=False,
158
+ scale=1):
159
+ det_faces = self.face_detector(self.input_img, scale)
160
+
161
+ if len(det_faces) == 0:
162
+ print('No face detected. Try to increase upsample_num_times.')
163
+ return 0
164
+ else:
165
+ if only_keep_largest:
166
+ print('Detected several faces; keeping only the largest.')
167
+ face_areas = []
168
+ for i in range(len(det_faces)):
169
+ face_area = (det_faces[i].rect.right() - det_faces[i].rect.left()) * (
170
+ det_faces[i].rect.bottom() - det_faces[i].rect.top())
171
+ face_areas.append(face_area)
172
+ largest_idx = face_areas.index(max(face_areas))
173
+ self.det_faces = [det_faces[largest_idx]]
174
+ else:
175
+ self.det_faces = det_faces
176
+
177
+ if len(self.det_faces) == 0:
178
+ return 0
179
+
180
+ for face in self.det_faces:
181
+ shape = self.shape_predictor_5(self.input_img, face.rect)
182
+ landmark = np.array([[part.x, part.y] for part in shape.parts()])
183
+ self.all_landmarks_5.append(landmark)
184
+
185
+ return len(self.all_landmarks_5)
186
+
187
+ def get_face_landmarks_5(self,
188
+ only_keep_largest=False,
189
+ only_center_face=False,
190
+ resize=None,
191
+ blur_ratio=0.01,
192
+ eye_dist_threshold=None):
193
+ if self.det_model == 'dlib':
194
+ return self.get_face_landmarks_5_dlib(only_keep_largest)
195
+
196
+ if resize is None:
197
+ scale = 1
198
+ input_img = self.input_img
199
+ else:
200
+ h, w = self.input_img.shape[0:2]
201
+ scale = resize / min(h, w)
202
+ scale = max(1, scale) # always scale up
203
+ h, w = int(h * scale), int(w * scale)
204
+ interp = cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR
205
+ input_img = cv2.resize(self.input_img, (w, h), interpolation=interp)
206
+
207
+ with torch.no_grad():
208
+ bboxes = self.face_detector.detect_faces(input_img)
209
+
210
+ if bboxes is None or bboxes.shape[0] == 0:
211
+ return 0
212
+ else:
213
+ bboxes = bboxes / scale
214
+
215
+ for bbox in bboxes:
216
+ # remove faces with too small eye distance: side faces or too small faces
217
+ eye_dist = np.linalg.norm([bbox[6] - bbox[8], bbox[7] - bbox[9]])
218
+ if eye_dist_threshold is not None and (eye_dist < eye_dist_threshold):
219
+ continue
220
+
221
+ if self.template_3points:
222
+ landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 11, 2)])
223
+ else:
224
+ landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 15, 2)])
225
+ self.all_landmarks_5.append(landmark)
226
+ self.det_faces.append(bbox[0:5])
227
+
228
+ if len(self.det_faces) == 0:
229
+ return 0
230
+ if only_keep_largest:
231
+ h, w, _ = self.input_img.shape
232
+ self.det_faces, largest_idx = get_largest_face(self.det_faces, h, w)
233
+ self.all_landmarks_5 = [self.all_landmarks_5[largest_idx]]
234
+ elif only_center_face:
235
+ h, w, _ = self.input_img.shape
236
+ self.det_faces, center_idx = get_center_face(self.det_faces, h, w)
237
+ self.all_landmarks_5 = [self.all_landmarks_5[center_idx]]
238
+
239
+ # pad blurry images
240
+ if self.pad_blur:
241
+ self.pad_input_imgs = []
242
+ for landmarks in self.all_landmarks_5:
243
+ # get landmarks
244
+ eye_left = landmarks[0, :]
245
+ eye_right = landmarks[1, :]
246
+ eye_avg = (eye_left + eye_right) * 0.5
247
+ mouth_avg = (landmarks[3, :] + landmarks[4, :]) * 0.5
248
+ eye_to_eye = eye_right - eye_left
249
+ eye_to_mouth = mouth_avg - eye_avg
250
+
251
+ # Get the oriented crop rectangle
252
+ # x: half width of the oriented crop rectangle
253
+ x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
254
+ # - np.flipud(eye_to_mouth) * [-1, 1]: rotate 90 clockwise
255
+ # norm with the hypotenuse: get the direction
256
+ x /= np.hypot(*x) # get the hypotenuse of a right triangle
257
+ rect_scale = 1.5
258
+ x *= max(np.hypot(*eye_to_eye) * 2.0 * rect_scale, np.hypot(*eye_to_mouth) * 1.8 * rect_scale)
259
+ # y: half height of the oriented crop rectangle
260
+ y = np.flipud(x) * [-1, 1]
261
+
262
+ # c: center
263
+ c = eye_avg + eye_to_mouth * 0.1
264
+ # quad: (left_top, left_bottom, right_bottom, right_top)
265
+ quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
266
+ # qsize: side length of the square
267
+ qsize = np.hypot(*x) * 2
268
+ border = max(int(np.rint(qsize * 0.1)), 3)
269
+
270
+ # get pad
271
+ # pad: (width_left, height_top, width_right, height_bottom)
272
+ pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
273
+ int(np.ceil(max(quad[:, 1]))))
274
+ pad = [
275
+ max(-pad[0] + border, 1),
276
+ max(-pad[1] + border, 1),
277
+ max(pad[2] - self.input_img.shape[0] + border, 1),
278
+ max(pad[3] - self.input_img.shape[1] + border, 1)
279
+ ]
280
+
281
+ if max(pad) > 1:
282
+ # pad image
283
+ pad_img = np.pad(self.input_img, ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
284
+ # modify landmark coords
285
+ landmarks[:, 0] += pad[0]
286
+ landmarks[:, 1] += pad[1]
287
+ # blur pad images
288
+ h, w, _ = pad_img.shape
289
+ y, x, _ = np.ogrid[:h, :w, :1]
290
+ mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0],
291
+ np.float32(w - 1 - x) / pad[2]),
292
+ 1.0 - np.minimum(np.float32(y) / pad[1],
293
+ np.float32(h - 1 - y) / pad[3]))
294
+ blur = int(qsize * blur_ratio)
295
+ if blur % 2 == 0:
296
+ blur += 1
297
+ blur_img = cv2.boxFilter(pad_img, 0, ksize=(blur, blur))
298
+ # blur_img = cv2.GaussianBlur(pad_img, (blur, blur), 0)
299
+
300
+ pad_img = pad_img.astype('float32')
301
+ pad_img += (blur_img - pad_img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
302
+ pad_img += (np.median(pad_img, axis=(0, 1)) - pad_img) * np.clip(mask, 0.0, 1.0)
303
+ pad_img = np.clip(pad_img, 0, 255) # float32, [0, 255]
304
+ self.pad_input_imgs.append(pad_img)
305
+ else:
306
+ self.pad_input_imgs.append(np.copy(self.input_img))
307
+
308
+ return len(self.all_landmarks_5)
309
+
310
+ def align_warp_face(self, save_cropped_path=None, border_mode='constant'):
311
+ """Align and warp faces with face template.
312
+ """
313
+ if self.pad_blur:
314
+ assert len(self.pad_input_imgs) == len(
315
+ self.all_landmarks_5), f'Mismatched samples: {len(self.pad_input_imgs)} and {len(self.all_landmarks_5)}'
316
+ for idx, landmark in enumerate(self.all_landmarks_5):
317
+ # use 5 landmarks to get affine matrix
318
+ # use cv2.LMEDS method for the equivalence to skimage transform
319
+ # ref: https://blog.csdn.net/yichxi/article/details/115827338
320
+ affine_matrix = cv2.estimateAffinePartial2D(landmark, self.face_template, method=cv2.LMEDS)[0]
321
+ self.affine_matrices.append(affine_matrix)
322
+ # warp and crop faces
323
+ if border_mode == 'constant':
324
+ border_mode = cv2.BORDER_CONSTANT
325
+ elif border_mode == 'reflect101':
326
+ border_mode = cv2.BORDER_REFLECT101
327
+ elif border_mode == 'reflect':
328
+ border_mode = cv2.BORDER_REFLECT
329
+ if self.pad_blur:
330
+ input_img = self.pad_input_imgs[idx]
331
+ else:
332
+ input_img = self.input_img
333
+ cropped_face = cv2.warpAffine(
334
+ input_img, affine_matrix, self.face_size, borderMode=border_mode, borderValue=(135, 133, 132)) # gray
335
+ self.cropped_faces.append(cropped_face)
336
+ # save the cropped face
337
+ if save_cropped_path is not None:
338
+ path = os.path.splitext(save_cropped_path)[0]
339
+ save_path = f'{path}_{idx:02d}.{self.save_ext}'
340
+ imwrite(cropped_face, save_path)
341
+
342
+ def get_inverse_affine(self, save_inverse_affine_path=None):
343
+ """Get inverse affine matrix."""
344
+ for idx, affine_matrix in enumerate(self.affine_matrices):
345
+ inverse_affine = cv2.invertAffineTransform(affine_matrix)
346
+ inverse_affine *= self.upscale_factor
347
+ self.inverse_affine_matrices.append(inverse_affine)
348
+ # save inverse affine matrices
349
+ if save_inverse_affine_path is not None:
350
+ path, _ = os.path.splitext(save_inverse_affine_path)
351
+ save_path = f'{path}_{idx:02d}.pth'
352
+ torch.save(inverse_affine, save_path)
353
+
354
+ def add_restored_face(self, restored_face, input_face=None):
355
+ # if self.is_gray:
356
+ # restored_face = bgr2gray(restored_face) # convert img into grayscale
357
+ # if input_face is not None:
358
+ # restored_face = adain_npy(restored_face, input_face) # transfer the color
359
+ self.restored_faces.append(restored_face)
360
+
361
+ def paste_faces_to_input_image(self, save_path=None, upsample_img=None, draw_box=False, face_upsampler=None):
362
+ h, w, _ = self.input_img.shape
363
+ h_up, w_up = int(h * self.upscale_factor), int(w * self.upscale_factor)
364
+
365
+ if upsample_img is None:
366
+ # simply resize the background
367
+ # upsample_img = cv2.resize(self.input_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4)
368
+ upsample_img = cv2.resize(self.input_img, (w_up, h_up), interpolation=cv2.INTER_LINEAR)
369
+ else:
370
+ upsample_img = cv2.resize(upsample_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4)
371
+
372
+ assert len(self.restored_faces) == len(
373
+ self.inverse_affine_matrices), ('lengths of restored_faces and affine_matrices are different.')
374
+
375
+ inv_mask_borders = []
376
+ for restored_face, inverse_affine in zip(self.restored_faces, self.inverse_affine_matrices):
377
+ if face_upsampler is not None:
378
+ restored_face = face_upsampler.enhance(restored_face, outscale=self.upscale_factor)[0]
379
+ inverse_affine /= self.upscale_factor
380
+ inverse_affine[:, 2] *= self.upscale_factor
381
+ face_size = (self.face_size[0] * self.upscale_factor, self.face_size[1] * self.upscale_factor)
382
+ else:
383
+ # Add an offset to inverse affine matrix, for more precise back alignment
384
+ if self.upscale_factor > 1:
385
+ extra_offset = 0.5 * self.upscale_factor
386
+ else:
387
+ extra_offset = 0
388
+ inverse_affine[:, 2] += extra_offset
389
+ face_size = self.face_size
390
+ inv_restored = cv2.warpAffine(restored_face, inverse_affine, (w_up, h_up))
391
+
392
+ # if draw_box or not self.use_parse: # use square parse maps
393
+ # mask = np.ones(face_size, dtype=np.float32)
394
+ # inv_mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up))
395
+ # # remove the black borders
396
+ # inv_mask_erosion = cv2.erode(
397
+ # inv_mask, np.ones((int(2 * self.upscale_factor), int(2 * self.upscale_factor)), np.uint8))
398
+ # pasted_face = inv_mask_erosion[:, :, None] * inv_restored
399
+ # total_face_area = np.sum(inv_mask_erosion) # // 3
400
+ # # add border
401
+ # if draw_box:
402
+ # h, w = face_size
403
+ # mask_border = np.ones((h, w, 3), dtype=np.float32)
404
+ # border = int(1400/np.sqrt(total_face_area))
405
+ # mask_border[border:h-border, border:w-border,:] = 0
406
+ # inv_mask_border = cv2.warpAffine(mask_border, inverse_affine, (w_up, h_up))
407
+ # inv_mask_borders.append(inv_mask_border)
408
+ # if not self.use_parse:
409
+ # # compute the fusion edge based on the area of face
410
+ # w_edge = int(total_face_area**0.5) // 20
411
+ # erosion_radius = w_edge * 2
412
+ # inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8))
413
+ # blur_size = w_edge * 2
414
+ # inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0)
415
+ # if len(upsample_img.shape) == 2: # upsample_img is gray image
416
+ # upsample_img = upsample_img[:, :, None]
417
+ # inv_soft_mask = inv_soft_mask[:, :, None]
418
+
419
+ # always use square mask
420
+ mask = np.ones(face_size, dtype=np.float32)
421
+ inv_mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up))
422
+ # remove the black borders
423
+ inv_mask_erosion = cv2.erode(
424
+ inv_mask, np.ones((int(2 * self.upscale_factor), int(2 * self.upscale_factor)), np.uint8))
425
+ pasted_face = inv_mask_erosion[:, :, None] * inv_restored
426
+ total_face_area = np.sum(inv_mask_erosion) # // 3
427
+ # add border
428
+ if draw_box:
429
+ h, w = face_size
430
+ mask_border = np.ones((h, w, 3), dtype=np.float32)
431
+ border = int(1400 / np.sqrt(total_face_area))
432
+ mask_border[border:h - border, border:w - border, :] = 0
433
+ inv_mask_border = cv2.warpAffine(mask_border, inverse_affine, (w_up, h_up))
434
+ inv_mask_borders.append(inv_mask_border)
435
+ # compute the fusion edge based on the area of face
436
+ w_edge = int(total_face_area ** 0.5) // 20
437
+ erosion_radius = w_edge * 2
438
+ inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8))
439
+ blur_size = w_edge * 2
440
+ inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0)
441
+ if len(upsample_img.shape) == 2: # upsample_img is gray image
442
+ upsample_img = upsample_img[:, :, None]
443
+ inv_soft_mask = inv_soft_mask[:, :, None]
444
+
445
+ # parse mask
446
+ if self.use_parse:
447
+ # inference
448
+ face_input = cv2.resize(restored_face, (512, 512), interpolation=cv2.INTER_LINEAR)
449
+ face_input = img2tensor(face_input.astype('float32') / 255., bgr2rgb=True, float32=True)
450
+ normalize(face_input, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
451
+ face_input = torch.unsqueeze(face_input, 0).to(self.device)
452
+ with torch.no_grad():
453
+ out = self.face_parse(face_input)[0]
454
+ out = out.argmax(dim=1).squeeze().cpu().numpy()
455
+
456
+ parse_mask = np.zeros(out.shape)
457
+ MASK_COLORMAP = [0, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 0, 0, 0]
458
+ for idx, color in enumerate(MASK_COLORMAP):
459
+ parse_mask[out == idx] = color
460
+ # blur the mask
461
+ parse_mask = cv2.GaussianBlur(parse_mask, (101, 101), 11)
462
+ parse_mask = cv2.GaussianBlur(parse_mask, (101, 101), 11)
463
+ # remove the black borders
464
+ thres = 10
465
+ parse_mask[:thres, :] = 0
466
+ parse_mask[-thres:, :] = 0
467
+ parse_mask[:, :thres] = 0
468
+ parse_mask[:, -thres:] = 0
469
+ parse_mask = parse_mask / 255.
470
+
471
+ parse_mask = cv2.resize(parse_mask, face_size)
472
+ parse_mask = cv2.warpAffine(parse_mask, inverse_affine, (w_up, h_up), flags=3)
473
+ inv_soft_parse_mask = parse_mask[:, :, None]
474
+ # pasted_face = inv_restored
475
+ fuse_mask = (inv_soft_parse_mask < inv_soft_mask).astype('int')
476
+ inv_soft_mask = inv_soft_parse_mask * fuse_mask + inv_soft_mask * (1 - fuse_mask)
477
+
478
+ if len(upsample_img.shape) == 3 and upsample_img.shape[2] == 4: # alpha channel
479
+ alpha = upsample_img[:, :, 3:]
480
+ upsample_img = inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img[:, :, 0:3]
481
+ upsample_img = np.concatenate((upsample_img, alpha), axis=2)
482
+ else:
483
+ upsample_img = inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img
484
+
485
+ if np.max(upsample_img) > 256: # 16-bit image
486
+ upsample_img = upsample_img.astype(np.uint16)
487
+ else:
488
+ upsample_img = upsample_img.astype(np.uint8)
489
+
490
+ # draw bounding box
491
+ if draw_box:
492
+ # upsample_input_img = cv2.resize(input_img, (w_up, h_up))
493
+ img_color = np.ones([*upsample_img.shape], dtype=np.float32)
494
+ img_color[:, :, 0] = 0
495
+ img_color[:, :, 1] = 255
496
+ img_color[:, :, 2] = 0
497
+ for inv_mask_border in inv_mask_borders:
498
+ upsample_img = inv_mask_border * img_color + (1 - inv_mask_border) * upsample_img
499
+ # upsample_input_img = inv_mask_border * img_color + (1 - inv_mask_border) * upsample_input_img
500
+
501
+ if save_path is not None:
502
+ path = os.path.splitext(save_path)[0]
503
+ save_path = f'{path}.{self.save_ext}'
504
+ imwrite(upsample_img, save_path)
505
+ return upsample_img
506
+
507
+ def clean_all(self):
508
+ self.all_landmarks_5 = []
509
+ self.restored_faces = []
510
+ self.affine_matrices = []
511
+ self.cropped_faces = []
512
+ self.inverse_affine_matrices = []
513
+ self.det_faces = []
514
+ self.pad_input_imgs = []
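
A sketch of the round trip this helper is designed for: detect and align faces, restore each crop with an external model, then paste the restored crops back into the upscaled frame. restore_face below is a hypothetical stand-in for whatever face model is plugged in, and photo.png is a placeholder path.

from SUPIR.utils.face_restoration_helper import FaceRestoreHelper

def restore_face(face_bgr):
    # Hypothetical stand-in: a real pipeline would run a face-restoration model here
    # and return a BGR uint8 image with the same size as the aligned crop.
    return face_bgr

helper = FaceRestoreHelper(upscale_factor=2, face_size=512, use_parse=True)

helper.read_image('photo.png')                 # accepts a path or a BGR uint8 array
helper.get_face_landmarks_5(only_center_face=False, eye_dist_threshold=5)
helper.align_warp_face()                       # fills helper.cropped_faces

for cropped_face in helper.cropped_faces:
    helper.add_restored_face(restore_face(cropped_face), cropped_face)

helper.get_inverse_affine()
result = helper.paste_faces_to_input_image(save_path='photo_restored.png')
helper.clean_all()                             # reset state before the next image
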
SUPIR/utils/file.py ADDED
@@ -0,0 +1,79 @@
1
+ import os
2
+ from typing import List, Tuple
3
+
4
+ from urllib.parse import urlparse
5
+ from torch.hub import download_url_to_file, get_dir
6
+
7
+
8
+ def load_file_list(file_list_path: str) -> List[str]:
9
+ files = []
10
+ # each line in file list contains a path of an image
11
+ with open(file_list_path, "r") as fin:
12
+ for line in fin:
13
+ path = line.strip()
14
+ if path:
15
+ files.append(path)
16
+ return files
17
+
18
+
19
+ def list_image_files(
20
+ img_dir: str,
21
+ exts: Tuple[str, ...]=(".jpg", ".png", ".jpeg"),
22
+ follow_links: bool=False,
23
+ log_progress: bool=False,
24
+ log_every_n_files: int=10000,
25
+ max_size: int=-1
26
+ ) -> List[str]:
27
+ files = []
28
+ for dir_path, _, file_names in os.walk(img_dir, followlinks=follow_links):
29
+ early_stop = False
30
+ for file_name in file_names:
31
+ if os.path.splitext(file_name)[1].lower() in exts:
32
+ if max_size >= 0 and len(files) >= max_size:
33
+ early_stop = True
34
+ break
35
+ files.append(os.path.join(dir_path, file_name))
36
+ if log_progress and len(files) % log_every_n_files == 0:
37
+ print(f"found {len(files)} images in {img_dir}")
38
+ if early_stop:
39
+ break
40
+ return files
41
+
42
+
43
+ def get_file_name_parts(file_path: str) -> Tuple[str, str, str]:
44
+ parent_path, file_name = os.path.split(file_path)
45
+ stem, ext = os.path.splitext(file_name)
46
+ return parent_path, stem, ext
47
+
48
+
49
+ # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/utils/download_util.py/
50
+ def load_file_from_url(url, model_dir=None, progress=True, file_name=None):
51
+ """Load file from http url; will download models if necessary.
52
+
53
+ Ref:https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py
54
+
55
+ Args:
56
+ url (str): URL to be downloaded.
57
+ model_dir (str): The path to save the downloaded model. Should be a full path. If None, use pytorch hub_dir.
58
+ Default: None.
59
+ progress (bool): Whether to show the download progress. Default: True.
60
+ file_name (str): The downloaded file name. If None, use the file name in the url. Default: None.
61
+
62
+ Returns:
63
+ str: The path to the downloaded file.
64
+ """
65
+ if model_dir is None: # use the pytorch hub_dir
66
+ hub_dir = get_dir()
67
+ model_dir = os.path.join(hub_dir, 'checkpoints')
68
+
69
+ os.makedirs(model_dir, exist_ok=True)
70
+
71
+ parts = urlparse(url)
72
+ filename = os.path.basename(parts.path)
73
+ if file_name is not None:
74
+ filename = file_name
75
+ cached_file = os.path.abspath(os.path.join(model_dir, filename))
76
+ if not os.path.exists(cached_file):
77
+ print(f'Downloading: "{url}" to {cached_file}\n')
78
+ download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
79
+ return cached_file
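
A short sketch of the two most commonly used helpers in this module. The URL and the directories below are placeholders; any direct checkpoint link and any image folder behave the same way.

from SUPIR.utils.file import list_image_files, load_file_from_url

# Downloaded once, then served from the cache under model_dir on later calls.
ckpt_path = load_file_from_url(
    url="https://example.com/models/some_checkpoint.pth",  # placeholder URL
    model_dir="weights",
    progress=True,
)

# Recursively collect up to 1000 images under a (placeholder) dataset directory.
images = list_image_files("datasets/train", exts=(".jpg", ".png"), max_size=1000)
print(ckpt_path, len(images))
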
SUPIR/utils/tilevae.py ADDED
@@ -0,0 +1,971 @@
1
+ # ------------------------------------------------------------------------
2
+ #
3
+ # Ultimate VAE Tile Optimization
4
+ #
5
+ # Introducing a revolutionary new optimization designed to make
6
+ # the VAE work with giant images on limited VRAM!
7
+ # Say goodbye to the frustration of OOM and hello to seamless output!
8
+ #
9
+ # ------------------------------------------------------------------------
10
+ #
11
+ # This script is a wild hack that splits the image into tiles,
12
+ # encodes each tile separately, and merges the result back together.
13
+ #
14
+ # Advantages:
15
+ # - The VAE can now work with giant images on limited VRAM
16
+ # (~10 GB for 8K images!)
17
+ # - The merged output is completely seamless without any post-processing.
18
+ #
19
+ # Drawbacks:
20
+ # - Giant RAM needed. To store the intermediate results for a 4096x4096
21
+ # image, you need a 32 GB RAM machine (it consumes ~20 GB); for 8192x8192
22
+ # you need a 128 GB RAM machine (it consumes ~100 GB).
23
+ # - NaNs always appear for 8k images when you use the fp16 (half) VAE.
24
+ # You must use --no-half-vae to disable half VAE for that giant image.
25
+ # - Slow speed. With default tile size, it takes around 50/200 seconds
26
+ # to encode/decode a 4096x4096 image; and 200/900 seconds to encode/decode
27
+ # an 8192x8192 image. (The speed is limited by both the GPU and the CPU.)
28
+ # - The gradient calculation is not compatible with this hack. It
29
+ # will break any backward() or torch.autograd.grad() that passes VAE.
30
+ # (But you can still use the VAE to generate training data.)
31
+ #
32
+ # How it works:
33
+ # 1) The image is split into tiles.
34
+ # - To ensure perfect results, each tile is padded with 32 pixels
35
+ # on each side.
36
+ # - Then the conv2d/silu/upsample/downsample can produce identical
37
+ # results to the original image without splitting.
38
+ # 2) The original forward is decomposed into a task queue and a task worker.
39
+ # - The task queue is a list of functions that will be executed in order.
40
+ # - The task worker is a loop that executes the tasks in the queue.
41
+ # 3) The task queue is executed for each tile.
42
+ # - Current tile is sent to GPU.
43
+ # - local operations are directly executed.
44
+ # - Group norm calculation is temporarily suspended until the mean
45
+ # and var of all tiles are calculated.
46
+ # - The residual is pre-calculated, stored, and added back later.
47
+ # - When moving on to the next tile, the current tile is sent to the CPU.
48
+ # 4) After all tiles are processed, they are merged on the CPU and returned.
49
+ #
50
+ # Enjoy!
51
+ #
52
+ # @author: LI YI @ Nanyang Technological University - Singapore
53
+ # @date: 2023-03-02
54
+ # @license: MIT License
55
+ #
56
+ # Please give me a star if you like this project!
57
+ #
58
+ # -------------------------------------------------------------------------
59
+
60
+ import gc
61
+ from time import time
62
+ import math
63
+ from tqdm import tqdm
64
+
65
+ import torch
66
+ import torch.version
67
+ import torch.nn.functional as F
68
+ from einops import rearrange
69
+ from diffusers.utils.import_utils import is_xformers_available
70
+
71
+ import SUPIR.utils.devices as devices
72
+
73
+ try:
74
+ import xformers
75
+ import xformers.ops
76
+ except ImportError:
77
+ pass
78
+
79
+ sd_flag = True
80
+
81
+ def get_recommend_encoder_tile_size():
82
+ if torch.cuda.is_available():
83
+ total_memory = torch.cuda.get_device_properties(
84
+ devices.device).total_memory // 2**20
85
+ if total_memory > 16*1000:
86
+ ENCODER_TILE_SIZE = 3072
87
+ elif total_memory > 12*1000:
88
+ ENCODER_TILE_SIZE = 2048
89
+ elif total_memory > 8*1000:
90
+ ENCODER_TILE_SIZE = 1536
91
+ else:
92
+ ENCODER_TILE_SIZE = 960
93
+ else:
94
+ ENCODER_TILE_SIZE = 512
95
+ return ENCODER_TILE_SIZE
96
+
97
+
98
+ def get_recommend_decoder_tile_size():
99
+ if torch.cuda.is_available():
100
+ total_memory = torch.cuda.get_device_properties(
101
+ devices.device).total_memory // 2**20
102
+ if total_memory > 30*1000:
103
+ DECODER_TILE_SIZE = 256
104
+ elif total_memory > 16*1000:
105
+ DECODER_TILE_SIZE = 192
106
+ elif total_memory > 12*1000:
107
+ DECODER_TILE_SIZE = 128
108
+ elif total_memory > 8*1000:
109
+ DECODER_TILE_SIZE = 96
110
+ else:
111
+ DECODER_TILE_SIZE = 64
112
+ else:
113
+ DECODER_TILE_SIZE = 64
114
+ return DECODER_TILE_SIZE
115
+
116
+
117
+ if 'global const':
118
+ DEFAULT_ENABLED = False
119
+ DEFAULT_MOVE_TO_GPU = False
120
+ DEFAULT_FAST_ENCODER = True
121
+ DEFAULT_FAST_DECODER = True
122
+ DEFAULT_COLOR_FIX = 0
123
+ DEFAULT_ENCODER_TILE_SIZE = get_recommend_encoder_tile_size()
124
+ DEFAULT_DECODER_TILE_SIZE = get_recommend_decoder_tile_size()
125
+
126
+
127
+ # inplace version of silu
128
+ def inplace_nonlinearity(x):
129
+ # Test: fix for Nans
130
+ return F.silu(x, inplace=True)
131
+
132
+ # extracted from ldm.modules.diffusionmodules.model
133
+
134
+ # from diffusers lib
135
+ def attn_forward_new(self, h_):
136
+ batch_size, channel, height, width = h_.shape
137
+ hidden_states = h_.view(batch_size, channel, height * width).transpose(1, 2)
138
+
139
+ attention_mask = None
140
+ encoder_hidden_states = None
141
+ batch_size, sequence_length, _ = hidden_states.shape
142
+ attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size)
143
+
144
+ query = self.to_q(hidden_states)
145
+
146
+ if encoder_hidden_states is None:
147
+ encoder_hidden_states = hidden_states
148
+ elif self.norm_cross:
149
+ encoder_hidden_states = self.norm_encoder_hidden_states(encoder_hidden_states)
150
+
151
+ key = self.to_k(encoder_hidden_states)
152
+ value = self.to_v(encoder_hidden_states)
153
+
154
+ query = self.head_to_batch_dim(query)
155
+ key = self.head_to_batch_dim(key)
156
+ value = self.head_to_batch_dim(value)
157
+
158
+ attention_probs = self.get_attention_scores(query, key, attention_mask)
159
+ hidden_states = torch.bmm(attention_probs, value)
160
+ hidden_states = self.batch_to_head_dim(hidden_states)
161
+
162
+ # linear proj
163
+ hidden_states = self.to_out[0](hidden_states)
164
+ # dropout
165
+ hidden_states = self.to_out[1](hidden_states)
166
+
167
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
168
+
169
+ return hidden_states
170
+
171
+ def attn_forward_new_pt2_0(self, hidden_states,):
172
+ scale = 1
173
+ attention_mask = None
174
+ encoder_hidden_states = None
175
+
176
+ input_ndim = hidden_states.ndim
177
+
178
+ if input_ndim == 4:
179
+ batch_size, channel, height, width = hidden_states.shape
180
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
181
+
182
+ batch_size, sequence_length, _ = (
183
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
184
+ )
185
+
186
+ if attention_mask is not None:
187
+ attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size)
188
+ # scaled_dot_product_attention expects attention_mask shape to be
189
+ # (batch, heads, source_length, target_length)
190
+ attention_mask = attention_mask.view(batch_size, self.heads, -1, attention_mask.shape[-1])
191
+
192
+ if self.group_norm is not None:
193
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
194
+
195
+ query = self.to_q(hidden_states, scale=scale)
196
+
197
+ if encoder_hidden_states is None:
198
+ encoder_hidden_states = hidden_states
199
+ elif self.norm_cross:
200
+ encoder_hidden_states = self.norm_encoder_hidden_states(encoder_hidden_states)
201
+
202
+ key = self.to_k(encoder_hidden_states, scale=scale)
203
+ value = self.to_v(encoder_hidden_states, scale=scale)
204
+
205
+ inner_dim = key.shape[-1]
206
+ head_dim = inner_dim // self.heads
207
+
208
+ query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
209
+
210
+ key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
211
+ value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
212
+
213
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
214
+ # TODO: add support for attn.scale when we move to Torch 2.1
215
+ hidden_states = F.scaled_dot_product_attention(
216
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
217
+ )
218
+
219
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
220
+ hidden_states = hidden_states.to(query.dtype)
221
+
222
+ # linear proj
223
+ hidden_states = self.to_out[0](hidden_states, scale=scale)
224
+ # dropout
225
+ hidden_states = self.to_out[1](hidden_states)
226
+
227
+ if input_ndim == 4:
228
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
229
+
230
+ return hidden_states
231
+
232
+ def attn_forward_new_xformers(self, hidden_states):
233
+ scale = 1
234
+ attention_op = None
235
+ attention_mask = None
236
+ encoder_hidden_states = None
237
+
238
+ input_ndim = hidden_states.ndim
239
+
240
+ if input_ndim == 4:
241
+ batch_size, channel, height, width = hidden_states.shape
242
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
243
+
244
+ batch_size, key_tokens, _ = (
245
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
246
+ )
247
+
248
+ attention_mask = self.prepare_attention_mask(attention_mask, key_tokens, batch_size)
249
+ if attention_mask is not None:
250
+ # expand our mask's singleton query_tokens dimension:
251
+ # [batch*heads, 1, key_tokens] ->
252
+ # [batch*heads, query_tokens, key_tokens]
253
+ # so that it can be added as a bias onto the attention scores that xformers computes:
254
+ # [batch*heads, query_tokens, key_tokens]
255
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
256
+ _, query_tokens, _ = hidden_states.shape
257
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
258
+
259
+ if self.group_norm is not None:
260
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
261
+
262
+ query = self.to_q(hidden_states, scale=scale)
263
+
264
+ if encoder_hidden_states is None:
265
+ encoder_hidden_states = hidden_states
266
+ elif self.norm_cross:
267
+ encoder_hidden_states = self.norm_encoder_hidden_states(encoder_hidden_states)
268
+
269
+ key = self.to_k(encoder_hidden_states, scale=scale)
270
+ value = self.to_v(encoder_hidden_states, scale=scale)
271
+
272
+ query = self.head_to_batch_dim(query).contiguous()
273
+ key = self.head_to_batch_dim(key).contiguous()
274
+ value = self.head_to_batch_dim(value).contiguous()
275
+
276
+ hidden_states = xformers.ops.memory_efficient_attention(
277
+ query, key, value, attn_bias=attention_mask, op=attention_op#, scale=scale
278
+ )
279
+ hidden_states = hidden_states.to(query.dtype)
280
+ hidden_states = self.batch_to_head_dim(hidden_states)
281
+
282
+ # linear proj
283
+ hidden_states = self.to_out[0](hidden_states, scale=scale)
284
+ # dropout
285
+ hidden_states = self.to_out[1](hidden_states)
286
+
287
+ if input_ndim == 4:
288
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
289
+
290
+ return hidden_states
291
+
292
+ def attn_forward(self, h_):
293
+ q = self.q(h_)
294
+ k = self.k(h_)
295
+ v = self.v(h_)
296
+
297
+ # compute attention
298
+ b, c, h, w = q.shape
299
+ q = q.reshape(b, c, h*w)
300
+ q = q.permute(0, 2, 1) # b,hw,c
301
+ k = k.reshape(b, c, h*w) # b,c,hw
302
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
303
+ w_ = w_ * (int(c)**(-0.5))
304
+ w_ = torch.nn.functional.softmax(w_, dim=2)
305
+
306
+ # attend to values
307
+ v = v.reshape(b, c, h*w)
308
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
309
+ # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
310
+ h_ = torch.bmm(v, w_)
311
+ h_ = h_.reshape(b, c, h, w)
312
+
313
+ h_ = self.proj_out(h_)
314
+
315
+ return h_
316
+
317
+
318
+ def xformer_attn_forward(self, h_):
319
+ q = self.q(h_)
320
+ k = self.k(h_)
321
+ v = self.v(h_)
322
+
323
+ # compute attention
324
+ B, C, H, W = q.shape
325
+ q, k, v = map(lambda x: rearrange(x, 'b c h w -> b (h w) c'), (q, k, v))
326
+
327
+ q, k, v = map(
328
+ lambda t: t.unsqueeze(3)
329
+ .reshape(B, t.shape[1], 1, C)
330
+ .permute(0, 2, 1, 3)
331
+ .reshape(B * 1, t.shape[1], C)
332
+ .contiguous(),
333
+ (q, k, v),
334
+ )
335
+ out = xformers.ops.memory_efficient_attention(
336
+ q, k, v, attn_bias=None, op=self.attention_op)
337
+
338
+ out = (
339
+ out.unsqueeze(0)
340
+ .reshape(B, 1, out.shape[1], C)
341
+ .permute(0, 2, 1, 3)
342
+ .reshape(B, out.shape[1], C)
343
+ )
344
+ out = rearrange(out, 'b (h w) c -> b c h w', b=B, h=H, w=W, c=C)
345
+ out = self.proj_out(out)
346
+ return out
347
+
348
+
349
+ def attn2task(task_queue, net):
350
+ if False: #isinstance(net, AttnBlock):
351
+ task_queue.append(('store_res', lambda x: x))
352
+ task_queue.append(('pre_norm', net.norm))
353
+ task_queue.append(('attn', lambda x, net=net: attn_forward(net, x)))
354
+ task_queue.append(['add_res', None])
355
+ elif False: #isinstance(net, MemoryEfficientAttnBlock):
356
+ task_queue.append(('store_res', lambda x: x))
357
+ task_queue.append(('pre_norm', net.norm))
358
+ task_queue.append(
359
+ ('attn', lambda x, net=net: xformer_attn_forward(net, x)))
360
+ task_queue.append(['add_res', None])
361
+ else:
362
+ task_queue.append(('store_res', lambda x: x))
363
+ task_queue.append(('pre_norm', net.norm))
364
+ if is_xformers_available:
365
+ # task_queue.append(('attn', lambda x, net=net: attn_forward_new_xformers(net, x)))
366
+ task_queue.append(
367
+ ('attn', lambda x, net=net: xformer_attn_forward(net, x)))
368
+ elif hasattr(F, "scaled_dot_product_attention"):
369
+ task_queue.append(('attn', lambda x, net=net: attn_forward_new_pt2_0(net, x)))
370
+ else:
371
+ task_queue.append(('attn', lambda x, net=net: attn_forward_new(net, x)))
372
+ task_queue.append(['add_res', None])
373
+
374
+ def resblock2task(queue, block):
375
+ """
376
+ Turn a ResNetBlock into a sequence of tasks and append to the task queue
377
+
378
+ @param queue: the target task queue
379
+ @param block: ResNetBlock
380
+
381
+ """
382
+ if block.in_channels != block.out_channels:
383
+ if sd_flag:
384
+ if block.use_conv_shortcut:
385
+ queue.append(('store_res', block.conv_shortcut))
386
+ else:
387
+ queue.append(('store_res', block.nin_shortcut))
388
+ else:
389
+ if block.use_in_shortcut:
390
+ queue.append(('store_res', block.conv_shortcut))
391
+ else:
392
+ queue.append(('store_res', block.nin_shortcut))
393
+
394
+ else:
395
+ queue.append(('store_res', lambda x: x))
396
+ queue.append(('pre_norm', block.norm1))
397
+ queue.append(('silu', inplace_nonlinearity))
398
+ queue.append(('conv1', block.conv1))
399
+ queue.append(('pre_norm', block.norm2))
400
+ queue.append(('silu', inplace_nonlinearity))
401
+ queue.append(('conv2', block.conv2))
402
+ queue.append(['add_res', None])
403
+
404
+
405
+ def build_sampling(task_queue, net, is_decoder):
406
+ """
407
+ Build the sampling part of a task queue
408
+ @param task_queue: the target task queue
409
+ @param net: the network
410
+ @param is_decoder: currently building decoder or encoder
411
+ """
412
+ if is_decoder:
413
+ if sd_flag:
414
+ resblock2task(task_queue, net.mid.block_1)
415
+ attn2task(task_queue, net.mid.attn_1)
416
+ print(task_queue)
417
+ resblock2task(task_queue, net.mid.block_2)
418
+ resolution_iter = reversed(range(net.num_resolutions))
419
+ block_ids = net.num_res_blocks + 1
420
+ condition = 0
421
+ module = net.up
422
+ func_name = 'upsample'
423
+ else:
424
+ resblock2task(task_queue, net.mid_block.resnets[0])
425
+ attn2task(task_queue, net.mid_block.attentions[0])
426
+ resblock2task(task_queue, net.mid_block.resnets[1])
427
+ resolution_iter = (range(len(net.up_blocks))) # net.num_resolutions = 3
428
+ block_ids = 2 + 1
429
+ condition = len(net.up_blocks) - 1
430
+ module = net.up_blocks
431
+ func_name = 'upsamplers'
432
+ else:
433
+ if sd_flag:
434
+ resolution_iter = range(net.num_resolutions)
435
+ block_ids = net.num_res_blocks
436
+ condition = net.num_resolutions - 1
437
+ module = net.down
438
+ func_name = 'downsample'
439
+ else:
440
+ resolution_iter = range(len(net.down_blocks))
441
+ block_ids = 2
442
+ condition = len(net.down_blocks) - 1
443
+ module = net.down_blocks
444
+ func_name = 'downsamplers'
445
+
446
+ for i_level in resolution_iter:
447
+ for i_block in range(block_ids):
448
+ if sd_flag:
449
+ resblock2task(task_queue, module[i_level].block[i_block])
450
+ else:
451
+ resblock2task(task_queue, module[i_level].resnets[i_block])
452
+ if i_level != condition:
453
+ if sd_flag:
454
+ task_queue.append((func_name, getattr(module[i_level], func_name)))
455
+ else:
456
+ if is_decoder:
457
+ task_queue.append((func_name, module[i_level].upsamplers[0]))
458
+ else:
459
+ task_queue.append((func_name, module[i_level].downsamplers[0]))
460
+
461
+ if not is_decoder:
462
+ if sd_flag:
463
+ resblock2task(task_queue, net.mid.block_1)
464
+ attn2task(task_queue, net.mid.attn_1)
465
+ resblock2task(task_queue, net.mid.block_2)
466
+ else:
467
+ resblock2task(task_queue, net.mid_block.resnets[0])
468
+ attn2task(task_queue, net.mid_block.attentions[0])
469
+ resblock2task(task_queue, net.mid_block.resnets[1])
470
+
471
+
472
+ def build_task_queue(net, is_decoder):
473
+ """
474
+ Build a single task queue for the encoder or decoder
475
+ @param net: the VAE decoder or encoder network
476
+ @param is_decoder: currently building decoder or encoder
477
+ @return: the task queue
478
+ """
479
+ task_queue = []
480
+ task_queue.append(('conv_in', net.conv_in))
481
+
482
+ # construct the sampling part of the task queue
483
+ # because encoder and decoder share the same architecture, we extract the sampling part
484
+ build_sampling(task_queue, net, is_decoder)
485
+ if is_decoder and not sd_flag:
486
+ net.give_pre_end = False
487
+ net.tanh_out = False
488
+
489
+ if not is_decoder or not net.give_pre_end:
490
+ if sd_flag:
491
+ task_queue.append(('pre_norm', net.norm_out))
492
+ else:
493
+ task_queue.append(('pre_norm', net.conv_norm_out))
494
+ task_queue.append(('silu', inplace_nonlinearity))
495
+ task_queue.append(('conv_out', net.conv_out))
496
+ if is_decoder and net.tanh_out:
497
+ task_queue.append(('tanh', torch.tanh))
498
+
499
+ return task_queue
500
+
501
+
502
+ def clone_task_queue(task_queue):
503
+ """
504
+ Clone a task queue
505
+ @param task_queue: the task queue to be cloned
506
+ @return: the cloned task queue
507
+ """
508
+ return [[item for item in task] for task in task_queue]
509
+
510
+
511
+ def get_var_mean(input, num_groups, eps=1e-6):
512
+ """
513
+ Get mean and var for group norm
514
+ """
515
+ b, c = input.size(0), input.size(1)
516
+ channel_in_group = int(c/num_groups)
517
+ input_reshaped = input.contiguous().view(
518
+ 1, int(b * num_groups), channel_in_group, *input.size()[2:])
519
+ var, mean = torch.var_mean(
520
+ input_reshaped, dim=[0, 2, 3, 4], unbiased=False)
521
+ return var, mean
522
+
523
+
524
+ def custom_group_norm(input, num_groups, mean, var, weight=None, bias=None, eps=1e-6):
525
+ """
526
+ Custom group norm with fixed mean and var
527
+
528
+ @param input: input tensor
529
+ @param num_groups: number of groups. by default, num_groups = 32
530
+ @param mean: mean, must be pre-calculated by get_var_mean
531
+ @param var: var, must be pre-calculated by get_var_mean
532
+ @param weight: weight, should be fetched from the original group norm
533
+ @param bias: bias, should be fetched from the original group norm
534
+ @param eps: epsilon, by default, eps = 1e-6 to match the original group norm
535
+
536
+ @return: normalized tensor
537
+ """
538
+ b, c = input.size(0), input.size(1)
539
+ channel_in_group = int(c/num_groups)
540
+ input_reshaped = input.contiguous().view(
541
+ 1, int(b * num_groups), channel_in_group, *input.size()[2:])
542
+
543
+ out = F.batch_norm(input_reshaped, mean, var, weight=None, bias=None,
544
+ training=False, momentum=0, eps=eps)
545
+
546
+ out = out.view(b, c, *input.size()[2:])
547
+
548
+ # post affine transform
549
+ if weight is not None:
550
+ out *= weight.view(1, -1, 1, 1)
551
+ if bias is not None:
552
+ out += bias.view(1, -1, 1, 1)
553
+ return out
554
+
555
+
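As a sanity check (illustrative, not part of this commit), feeding the statistics from get_var_mean into custom_group_norm, the two helpers defined above, should reproduce torch's built-in group norm on the same tensor; this equivalence is what makes per-tile statistics interchangeable with a single full-image group norm.

import torch
import torch.nn.functional as F

x = torch.randn(2, 64, 16, 16)
var, mean = get_var_mean(x, 32)                       # per-(sample, group) statistics
gn = torch.nn.GroupNorm(32, 64, eps=1e-6)
out_custom = custom_group_norm(x, 32, mean, var, gn.weight, gn.bias)
out_ref = F.group_norm(x, 32, gn.weight, gn.bias, eps=1e-6)
assert torch.allclose(out_custom, out_ref, atol=1e-5)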
556
+ def crop_valid_region(x, input_bbox, target_bbox, is_decoder):
557
+ """
558
+ Crop the valid region from the tile
559
+ @param x: input tile
560
+ @param input_bbox: original input bounding box
561
+ @param target_bbox: output bounding box
562
+ @param is_decoder: True if the tile comes from the decoder (bboxes scaled x8), False for the encoder (scaled /8)
563
+ @return: cropped tile
564
+ """
565
+ padded_bbox = [i * 8 if is_decoder else i//8 for i in input_bbox]
566
+ margin = [target_bbox[i] - padded_bbox[i] for i in range(4)]
567
+ return x[:, :, margin[2]:x.size(2)+margin[3], margin[0]:x.size(3)+margin[1]]
568
+
569
+ # ↓↓↓ https://github.com/Kahsolt/stable-diffusion-webui-vae-tile-infer ↓↓↓
570
+
571
+
572
+ def perfcount(fn):
573
+ def wrapper(*args, **kwargs):
574
+ ts = time()
575
+
576
+ if torch.cuda.is_available():
577
+ torch.cuda.reset_peak_memory_stats(devices.device)
578
+ devices.torch_gc()
579
+ gc.collect()
580
+
581
+ ret = fn(*args, **kwargs)
582
+
583
+ devices.torch_gc()
584
+ gc.collect()
585
+ if torch.cuda.is_available():
586
+ vram = torch.cuda.max_memory_allocated(devices.device) / 2**20
587
+ torch.cuda.reset_peak_memory_stats(devices.device)
588
+ print(
589
+ f'[Tiled VAE]: Done in {time() - ts:.3f}s, max VRAM alloc {vram:.3f} MB')
590
+ else:
591
+ print(f'[Tiled VAE]: Done in {time() - ts:.3f}s')
592
+
593
+ return ret
594
+ return wrapper
595
+
596
+ # copy end :)
597
+
598
+
599
+ class GroupNormParam:
600
+ def __init__(self):
601
+ self.var_list = []
602
+ self.mean_list = []
603
+ self.pixel_list = []
604
+ self.weight = None
605
+ self.bias = None
606
+
607
+ def add_tile(self, tile, layer):
608
+ var, mean = get_var_mean(tile, 32)
609
+ # For giant images, the variance can exceed the float16 range.
610
+ # In that case, recompute the statistics on a float32 copy.
611
+ if var.dtype == torch.float16 and var.isinf().any():
612
+ fp32_tile = tile.float()
613
+ var, mean = get_var_mean(fp32_tile, 32)
614
+ # ============= DEBUG: test for infinite =============
615
+ # if torch.isinf(var).any():
616
+ # print('var: ', var)
617
+ # ====================================================
618
+ self.var_list.append(var)
619
+ self.mean_list.append(mean)
620
+ self.pixel_list.append(
621
+ tile.shape[2]*tile.shape[3])
622
+ if hasattr(layer, 'weight'):
623
+ self.weight = layer.weight
624
+ self.bias = layer.bias
625
+ else:
626
+ self.weight = None
627
+ self.bias = None
628
+
629
+ def summary(self):
630
+ """
631
+ summarize the per-tile means and vars and return a function
632
+ that applies group norm to each tile
633
+ """
634
+ if len(self.var_list) == 0:
635
+ return None
636
+ var = torch.vstack(self.var_list)
637
+ mean = torch.vstack(self.mean_list)
638
+ max_value = max(self.pixel_list)
639
+ pixels = torch.tensor(
640
+ self.pixel_list, dtype=torch.float32, device=devices.device) / max_value
641
+ sum_pixels = torch.sum(pixels)
642
+ pixels = pixels.unsqueeze(
643
+ 1) / sum_pixels
644
+ var = torch.sum(
645
+ var * pixels, dim=0)
646
+ mean = torch.sum(
647
+ mean * pixels, dim=0)
648
+ return lambda x: custom_group_norm(x, 32, mean, var, self.weight, self.bias)
649
+
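In other words, summary() estimates the statistics of the untiled feature map with a pixel-count-weighted average of the per-tile statistics, roughly mean ≈ Σ w_k · mean_k and var ≈ Σ w_k · var_k with w_k = pixels_k / Σ pixels. Averaging the per-tile variances ignores the spread between tile means, so this is an approximation of, not an exact match to, the GroupNorm statistics of the full tensor.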
650
+ @staticmethod
651
+ def from_tile(tile, norm):
652
+ """
653
+ create a group norm function from a single tile's statistics, without aggregating across tiles
654
+ """
655
+ var, mean = get_var_mean(tile, 32)
656
+ if var.dtype == torch.float16 and var.isinf().any():
657
+ fp32_tile = tile.float()
658
+ var, mean = get_var_mean(fp32_tile, 32)
659
+ # on MPS devices (Apple Silicon), convert the statistics back to float16
660
+ if var.device.type == 'mps':
661
+ # clamp to avoid overflow
662
+ var = torch.clamp(var, 0, 60000)
663
+ var = var.half()
664
+ mean = mean.half()
665
+ if hasattr(norm, 'weight'):
666
+ weight = norm.weight
667
+ bias = norm.bias
668
+ else:
669
+ weight = None
670
+ bias = None
671
+
672
+ def group_norm_func(x, mean=mean, var=var, weight=weight, bias=bias):
673
+ return custom_group_norm(x, 32, mean, var, weight, bias, 1e-6)
674
+ return group_norm_func
675
+
676
+
677
+ class VAEHook:
678
+ def __init__(self, net, tile_size, is_decoder, fast_decoder, fast_encoder, color_fix, to_gpu=False):
679
+ self.net = net # encoder | decoder
680
+ self.tile_size = tile_size
681
+ self.is_decoder = is_decoder
682
+ self.fast_mode = (fast_encoder and not is_decoder) or (
683
+ fast_decoder and is_decoder)
684
+ self.color_fix = color_fix and not is_decoder
685
+ self.to_gpu = to_gpu
686
+ self.pad = 11 if is_decoder else 32
687
+
688
+ def __call__(self, x):
689
+ B, C, H, W = x.shape
690
+ original_device = next(self.net.parameters()).device
691
+ try:
692
+ if self.to_gpu:
693
+ self.net.to(devices.get_optimal_device())
694
+ if max(H, W) <= self.pad * 2 + self.tile_size:
695
+ print("[Tiled VAE]: the input size is tiny and unnecessary to tile.")
696
+ return self.net.original_forward(x)
697
+ else:
698
+ return self.vae_tile_forward(x)
699
+ finally:
700
+ self.net.to(original_device)
701
+
702
+ def get_best_tile_size(self, lowerbound, upperbound):
703
+ """
704
+ Pick a tile size between lowerbound and upperbound that is divisible by the largest possible power of two, keeping tiles aligned with the network's downsampling while respecting the GPU memory budget (upperbound)
705
+ """
706
+ divider = 32
707
+ while divider >= 2:
708
+ remainder = lowerbound % divider
709
+ if remainder == 0:
710
+ return lowerbound
711
+ candidate = lowerbound - remainder + divider
712
+ if candidate <= upperbound:
713
+ return candidate
714
+ divider //= 2
715
+ return lowerbound
716
+
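A few worked examples of get_best_tile_size (illustrative, values checked by hand): it rounds the required size up to a multiple of the largest power-of-two divider that still fits within the requested tile size.

# get_best_tile_size(53, 96)   -> 64   (53 % 32 = 21, 53 - 21 + 32 = 64 <= 96)
# get_best_tile_size(100, 112) -> 112  (a multiple of 32 would overshoot, so it falls back to 16)
# get_best_tile_size(96, 96)   -> 96   (already a multiple of 32)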
717
+ def split_tiles(self, h, w):
718
+ """
719
+ Tool function to split the image into tiles
720
+ @param h: height of the image
721
+ @param w: width of the image
722
+ @return: tile_input_bboxes, tile_output_bboxes
723
+ """
724
+ tile_input_bboxes, tile_output_bboxes = [], []
725
+ tile_size = self.tile_size
726
+ pad = self.pad
727
+ num_height_tiles = math.ceil((h - 2 * pad) / tile_size)
728
+ num_width_tiles = math.ceil((w - 2 * pad) / tile_size)
729
+ # If either count is 0, force it to 1
730
+ # to handle long and thin images
731
+ num_height_tiles = max(num_height_tiles, 1)
732
+ num_width_tiles = max(num_width_tiles, 1)
733
+
734
+ # Suggestions from https://github.com/Kahsolt: auto shrink the tile size
735
+ real_tile_height = math.ceil((h - 2 * pad) / num_height_tiles)
736
+ real_tile_width = math.ceil((w - 2 * pad) / num_width_tiles)
737
+ real_tile_height = self.get_best_tile_size(real_tile_height, tile_size)
738
+ real_tile_width = self.get_best_tile_size(real_tile_width, tile_size)
739
+
740
+ print(f'[Tiled VAE]: split to {num_height_tiles}x{num_width_tiles} = {num_height_tiles*num_width_tiles} tiles. ' +
741
+ f'Optimal tile size {real_tile_width}x{real_tile_height}, original tile size {tile_size}x{tile_size}')
742
+
743
+ for i in range(num_height_tiles):
744
+ for j in range(num_width_tiles):
745
+ # bbox: [x1, x2, y1, y2]
746
+ # the padding is unnecessary at image borders, so we start directly from (pad, pad)
747
+ input_bbox = [
748
+ pad + j * real_tile_width,
749
+ min(pad + (j + 1) * real_tile_width, w),
750
+ pad + i * real_tile_height,
751
+ min(pad + (i + 1) * real_tile_height, h),
752
+ ]
753
+
754
+ # if the output bbox is close to the image boundary, we extend it to the image boundary
755
+ output_bbox = [
756
+ input_bbox[0] if input_bbox[0] > pad else 0,
757
+ input_bbox[1] if input_bbox[1] < w - pad else w,
758
+ input_bbox[2] if input_bbox[2] > pad else 0,
759
+ input_bbox[3] if input_bbox[3] < h - pad else h,
760
+ ]
761
+
762
+ # scale to get the final output bbox
763
+ output_bbox = [x * 8 if self.is_decoder else x // 8 for x in output_bbox]
764
+ tile_output_bboxes.append(output_bbox)
765
+
766
+ # unconditionally expand the input bbox by pad pixels on every side (clamped to the image)
767
+ tile_input_bboxes.append([
768
+ max(0, input_bbox[0] - pad),
769
+ min(w, input_bbox[1] + pad),
770
+ max(0, input_bbox[2] - pad),
771
+ min(h, input_bbox[3] + pad),
772
+ ])
773
+
774
+ return tile_input_bboxes, tile_output_bboxes
775
+
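A worked example for the decoder side (illustrative, pad = 11): for a 128x128 latent with tile_size = 96, split_tiles yields 2x2 tiles with a shrunken real tile size of 64x64.

# tile (0, 0):
#   input_bbox  = [11, 75, 11, 75]  -> padded by 11 and clamped to [0, 86, 0, 86]
#   output_bbox = [0, 75, 0, 75]    -> touches the border, so extended to 0, then scaled x8 to [0, 600, 0, 600]
# tiles in the last row/column are extended to the image border in the same way.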
776
+ @torch.no_grad()
777
+ def estimate_group_norm(self, z, task_queue, color_fix):
778
+ device = z.device
779
+ tile = z
780
+ last_id = len(task_queue) - 1
781
+ while last_id >= 0 and task_queue[last_id][0] != 'pre_norm':
782
+ last_id -= 1
783
+ if last_id <= 0 or task_queue[last_id][0] != 'pre_norm':
784
+ raise ValueError('No group norm found in the task queue')
785
+ # estimate until the last group norm
786
+ for i in range(last_id + 1):
787
+ task = task_queue[i]
788
+ if task[0] == 'pre_norm':
789
+ group_norm_func = GroupNormParam.from_tile(tile, task[1])
790
+ task_queue[i] = ('apply_norm', group_norm_func)
791
+ if i == last_id:
792
+ return True
793
+ tile = group_norm_func(tile)
794
+ elif task[0] == 'store_res':
795
+ task_id = i + 1
796
+ while task_id < last_id and task_queue[task_id][0] != 'add_res':
797
+ task_id += 1
798
+ if task_id >= last_id:
799
+ continue
800
+ task_queue[task_id][1] = task[1](tile)
801
+ elif task[0] == 'add_res':
802
+ tile += task[1].to(device)
803
+ task[1] = None
804
+ elif color_fix and task[0] == 'downsample':
805
+ for j in range(i, last_id + 1):
806
+ if task_queue[j][0] == 'store_res':
807
+ task_queue[j] = ('store_res_cpu', task_queue[j][1])
808
+ return True
809
+ else:
810
+ tile = task[1](tile)
811
+ try:
812
+ devices.test_for_nans(tile, "vae")
813
+ except Exception:
814
+ print('NaN detected in fast mode estimation. Fast mode disabled.')
815
+ return False
816
+
817
+ raise IndexError('Should not reach here')
818
+
819
+ @perfcount
820
+ @torch.no_grad()
821
+ def vae_tile_forward(self, z):
822
+ """
823
+ Encode or decode z in a tiled manner (depending on self.is_decoder).
824
+ @param z: latent tensor (decoder) or image tensor (encoder)
825
+ @return: decoded image or encoded latent
826
+ """
827
+ device = next(self.net.parameters()).device
828
+ dtype = z.dtype
829
+ net = self.net
830
+ tile_size = self.tile_size
831
+ is_decoder = self.is_decoder
832
+
833
+ z = z.detach() # detach the input to avoid backprop
834
+
835
+ N, height, width = z.shape[0], z.shape[2], z.shape[3]
836
+ net.last_z_shape = z.shape
837
+
838
+ # Split the input into tiles and build a task queue for each tile
839
+ print(f'[Tiled VAE]: input_size: {z.shape}, tile_size: {tile_size}, padding: {self.pad}')
840
+
841
+ in_bboxes, out_bboxes = self.split_tiles(height, width)
842
+
843
+ # Prepare tiles by split the input latents
844
+ tiles = []
845
+ for input_bbox in in_bboxes:
846
+ tile = z[:, :, input_bbox[2]:input_bbox[3], input_bbox[0]:input_bbox[1]].cpu()
847
+ tiles.append(tile)
848
+
849
+ num_tiles = len(tiles)
850
+ num_completed = 0
851
+
852
+ # Build task queues
853
+ single_task_queue = build_task_queue(net, is_decoder)
854
+ #print(single_task_queue)
855
+ if self.fast_mode:
856
+ # Fast mode: downsample the input image to the tile size,
857
+ # then estimate the group norm parameters on the downsampled image
858
+ scale_factor = tile_size / max(height, width)
859
+ z = z.to(device)
860
+ downsampled_z = F.interpolate(z, scale_factor=scale_factor, mode='nearest-exact')
861
+ # use nearest-exact to keep the statistics as close as possible
862
+ print(f'[Tiled VAE]: Fast mode enabled, estimating group norm parameters on {downsampled_z.shape[3]} x {downsampled_z.shape[2]} image')
863
+
864
+ # ======= Special thanks to @Kahsolt for distribution shift issue ======= #
865
+ # The downsampling will heavily distort its mean and std, so we need to recover it.
866
+ std_old, mean_old = torch.std_mean(z, dim=[0, 2, 3], keepdim=True)
867
+ std_new, mean_new = torch.std_mean(downsampled_z, dim=[0, 2, 3], keepdim=True)
868
+ downsampled_z = (downsampled_z - mean_new) / std_new * std_old + mean_old
869
+ del std_old, mean_old, std_new, mean_new
870
+ # occasionally the std_new is too small or too large, which exceeds the range of float16
871
+ # so we clamp the result back into z's original value range.
872
+ downsampled_z = torch.clamp_(downsampled_z, min=z.min(), max=z.max())
873
+ estimate_task_queue = clone_task_queue(single_task_queue)
874
+ if self.estimate_group_norm(downsampled_z, estimate_task_queue, color_fix=self.color_fix):
875
+ single_task_queue = estimate_task_queue
876
+ del downsampled_z
877
+
878
+ task_queues = [clone_task_queue(single_task_queue) for _ in range(num_tiles)]
879
+
880
+ # Dummy result
881
+ result = None
882
+ result_approx = None
883
+ #try:
884
+ # with devices.autocast():
885
+ # result_approx = torch.cat([F.interpolate(cheap_approximation(x).unsqueeze(0), scale_factor=opt_f, mode='nearest-exact') for x in z], dim=0).cpu()
886
+ #except: pass
887
+ # Free memory of input latent tensor
888
+ del z
889
+
890
+ # Task queue execution
891
+ pbar = tqdm(total=num_tiles * len(task_queues[0]), desc=f"[Tiled VAE]: Executing {'Decoder' if is_decoder else 'Encoder'} Task Queue: ")
892
+
893
+ # execute the tasks back and forth when switching tiles so that we always
894
+ # keep one tile on the GPU to reduce unnecessary data transfer
895
+ forward = True
896
+ interrupted = False
897
+ #state.interrupted = interrupted
898
+ while True:
899
+ #if state.interrupted: interrupted = True ; break
900
+
901
+ group_norm_param = GroupNormParam()
902
+ for i in range(num_tiles) if forward else reversed(range(num_tiles)):
903
+ #if state.interrupted: interrupted = True ; break
904
+
905
+ tile = tiles[i].to(device)
906
+ input_bbox = in_bboxes[i]
907
+ task_queue = task_queues[i]
908
+
909
+ interrupted = False
910
+ while len(task_queue) > 0:
911
+ #if state.interrupted: interrupted = True ; break
912
+
913
+ # DEBUG: current task
914
+ # print('Running task: ', task_queue[0][0], ' on tile ', i, '/', num_tiles, ' with shape ', tile.shape)
915
+ task = task_queue.pop(0)
916
+ if task[0] == 'pre_norm':
917
+ group_norm_param.add_tile(tile, task[1])
918
+ break
919
+ elif task[0] == 'store_res' or task[0] == 'store_res_cpu':
920
+ task_id = 0
921
+ res = task[1](tile)
922
+ if not self.fast_mode or task[0] == 'store_res_cpu':
923
+ res = res.cpu()
924
+ while task_queue[task_id][0] != 'add_res':
925
+ task_id += 1
926
+ task_queue[task_id][1] = res
927
+ elif task[0] == 'add_res':
928
+ tile += task[1].to(device)
929
+ task[1] = None
930
+ else:
931
+ tile = task[1](tile)
932
+ #print(tiles[i].shape, tile.shape, task)
933
+ pbar.update(1)
934
+
935
+ if interrupted: break
936
+
937
+ # check for NaNs in the tile.
938
+ # If there are NaNs, we abort the process to save user's time
939
+ #devices.test_for_nans(tile, "vae")
940
+
941
+ #print(tiles[i].shape, tile.shape, i, num_tiles)
942
+ if len(task_queue) == 0:
943
+ tiles[i] = None
944
+ num_completed += 1
945
+ if result is None: # NOTE: dim C varies between cases, so the result tensor can only be initialized dynamically
946
+ result = torch.zeros((N, tile.shape[1], height * 8 if is_decoder else height // 8, width * 8 if is_decoder else width // 8), device=device, requires_grad=False)
947
+ result[:, :, out_bboxes[i][2]:out_bboxes[i][3], out_bboxes[i][0]:out_bboxes[i][1]] = crop_valid_region(tile, in_bboxes[i], out_bboxes[i], is_decoder)
948
+ del tile
949
+ elif i == num_tiles - 1 and forward:
950
+ forward = False
951
+ tiles[i] = tile
952
+ elif i == 0 and not forward:
953
+ forward = True
954
+ tiles[i] = tile
955
+ else:
956
+ tiles[i] = tile.cpu()
957
+ del tile
958
+
959
+ if interrupted: break
960
+ if num_completed == num_tiles: break
961
+
962
+ # insert the group norm task to the head of each task queue
963
+ group_norm_func = group_norm_param.summary()
964
+ if group_norm_func is not None:
965
+ for i in range(num_tiles):
966
+ task_queue = task_queues[i]
967
+ task_queue.insert(0, ('apply_norm', group_norm_func))
968
+
969
+ # Done!
970
+ pbar.close()
971
+ return result.to(dtype) if result is not None else result_approx.to(device)
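A minimal attachment sketch (illustrative only; the actual hook installation lives elsewhere in this repository). VAEHook relies on the wrapped module exposing original_forward as the untiled fallback, so wiring it up could look roughly like this, assuming a VAE object with encoder and decoder submodules:

def attach_tiled_vae(vae, encoder_tile_size=1024, decoder_tile_size=128):
    # hypothetical helper: wrap both halves of the VAE with the tiled forward
    for module, is_decoder, tile_size in (
        (vae.encoder, False, encoder_tile_size),
        (vae.decoder, True, decoder_tile_size),
    ):
        if not hasattr(module, 'original_forward'):
            module.original_forward = module.forward   # untiled path used for tiny inputs
        module.forward = VAEHook(
            module, tile_size, is_decoder=is_decoder,
            fast_decoder=False, fast_encoder=False,
            color_fix=False, to_gpu=False)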