lllyasviel committed
Commit 124e713
Parent: bb4b7e2

Change Gaussian kernel to anisotropic kernel. (#199)

.gitignore CHANGED
@@ -2,6 +2,9 @@ __pycache__
 *.ckpt
 *.safetensors
 *.pth
+lena.png
+lena_result.png
+lena_test.py
 !taesdxl_decoder.pth
 /repositories
 /venv
fooocus_version.py CHANGED
@@ -1 +1 @@
-version = '1.0.35'
+version = '1.0.36'
modules/anisotropic.py ADDED
@@ -0,0 +1,185 @@
+import torch
+
+
+Tensor = torch.Tensor
+Device = torch.DeviceObjType
+Dtype = torch.Type
+pad = torch.nn.functional.pad
+
+
+def _compute_zero_padding(kernel_size: tuple[int, int] | int) -> tuple[int, int]:
+    ky, kx = _unpack_2d_ks(kernel_size)
+    return (ky - 1) // 2, (kx - 1) // 2
+
+
+def _unpack_2d_ks(kernel_size: tuple[int, int] | int) -> tuple[int, int]:
+    if isinstance(kernel_size, int):
+        ky = kx = kernel_size
+    else:
+        assert len(kernel_size) == 2, '2D Kernel size should have a length of 2.'
+        ky, kx = kernel_size
+
+    ky = int(ky)
+    kx = int(kx)
+    return ky, kx
+
+
+def gaussian(
+    window_size: int, sigma: Tensor | float, *, device: Device | None = None, dtype: Dtype | None = None
+) -> Tensor:
+
+    batch_size = sigma.shape[0]
+
+    x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
+
+    if window_size % 2 == 0:
+        x = x + 0.5
+
+    gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
+
+    return gauss / gauss.sum(-1, keepdim=True)
+
+
+def get_gaussian_kernel1d(
+    kernel_size: int,
+    sigma: float | Tensor,
+    force_even: bool = False,
+    *,
+    device: Device | None = None,
+    dtype: Dtype | None = None,
+) -> Tensor:
+
+    return gaussian(kernel_size, sigma, device=device, dtype=dtype)
+
+
+def get_gaussian_kernel2d(
+    kernel_size: tuple[int, int] | int,
+    sigma: tuple[float, float] | Tensor,
+    force_even: bool = False,
+    *,
+    device: Device | None = None,
+    dtype: Dtype | None = None,
+) -> Tensor:
+
+    sigma = torch.Tensor([[sigma, sigma]]).to(device=device, dtype=dtype)
+
+    ksize_y, ksize_x = _unpack_2d_ks(kernel_size)
+    sigma_y, sigma_x = sigma[:, 0, None], sigma[:, 1, None]
+
+    kernel_y = get_gaussian_kernel1d(ksize_y, sigma_y, force_even, device=device, dtype=dtype)[..., None]
+    kernel_x = get_gaussian_kernel1d(ksize_x, sigma_x, force_even, device=device, dtype=dtype)[..., None]
+
+    return kernel_y * kernel_x.view(-1, 1, ksize_x)
+
+
+def _bilateral_blur(
+    input: Tensor,
+    guidance: Tensor | None,
+    kernel_size: tuple[int, int] | int,
+    sigma_color: float | Tensor,
+    sigma_space: tuple[float, float] | Tensor,
+    border_type: str = 'reflect',
+    color_distance_type: str = 'l1',
+) -> Tensor:
+
+    if isinstance(sigma_color, Tensor):
+        sigma_color = sigma_color.to(device=input.device, dtype=input.dtype).view(-1, 1, 1, 1, 1)
+
+    ky, kx = _unpack_2d_ks(kernel_size)
+    pad_y, pad_x = _compute_zero_padding(kernel_size)
+
+    padded_input = pad(input, (pad_x, pad_x, pad_y, pad_y), mode=border_type)
+    unfolded_input = padded_input.unfold(2, ky, 1).unfold(3, kx, 1).flatten(-2)  # (B, C, H, W, Ky x Kx)
+
+    if guidance is None:
+        guidance = input
+        unfolded_guidance = unfolded_input
+    else:
+        padded_guidance = pad(guidance, (pad_x, pad_x, pad_y, pad_y), mode=border_type)
+        unfolded_guidance = padded_guidance.unfold(2, ky, 1).unfold(3, kx, 1).flatten(-2)  # (B, C, H, W, Ky x Kx)
+
+    diff = unfolded_guidance - guidance.unsqueeze(-1)
+    if color_distance_type == "l1":
+        color_distance_sq = diff.abs().sum(1, keepdim=True).square()
+    elif color_distance_type == "l2":
+        color_distance_sq = diff.square().sum(1, keepdim=True)
+    else:
+        raise ValueError("color_distance_type only accepts l1 or l2")
+    color_kernel = (-0.5 / sigma_color**2 * color_distance_sq).exp()  # (B, 1, H, W, Ky x Kx)
+
+    space_kernel = get_gaussian_kernel2d(kernel_size, sigma_space, device=input.device, dtype=input.dtype)
+    space_kernel = space_kernel.view(-1, 1, 1, 1, kx * ky)
+
+    kernel = space_kernel * color_kernel
+    out = (unfolded_input * kernel).sum(-1) / kernel.sum(-1)
+    return out
+
+
+def bilateral_blur(
+    input: Tensor,
+    kernel_size: tuple[int, int] | int = (13, 13),
+    sigma_color: float | Tensor = 3.0,
+    sigma_space: tuple[float, float] | Tensor = 3.0,
+    border_type: str = 'reflect',
+    color_distance_type: str = 'l1',
+) -> Tensor:
+    return _bilateral_blur(input, None, kernel_size, sigma_color, sigma_space, border_type, color_distance_type)
+
+
+def joint_bilateral_blur(
+    input: Tensor,
+    guidance: Tensor,
+    kernel_size: tuple[int, int] | int,
+    sigma_color: float | Tensor,
+    sigma_space: tuple[float, float] | Tensor,
+    border_type: str = 'reflect',
+    color_distance_type: str = 'l1',
+) -> Tensor:
+    return _bilateral_blur(input, guidance, kernel_size, sigma_color, sigma_space, border_type, color_distance_type)
+
+
+class _BilateralBlur(torch.nn.Module):
+    def __init__(
+        self,
+        kernel_size: tuple[int, int] | int,
+        sigma_color: float | Tensor,
+        sigma_space: tuple[float, float] | Tensor,
+        border_type: str = 'reflect',
+        color_distance_type: str = "l1",
+    ) -> None:
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.sigma_color = sigma_color
+        self.sigma_space = sigma_space
+        self.border_type = border_type
+        self.color_distance_type = color_distance_type
+
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}"
+            f"(kernel_size={self.kernel_size}, "
+            f"sigma_color={self.sigma_color}, "
+            f"sigma_space={self.sigma_space}, "
+            f"border_type={self.border_type}, "
+            f"color_distance_type={self.color_distance_type})"
+        )
+
+
+class BilateralBlur(_BilateralBlur):
+    def forward(self, input: Tensor) -> Tensor:
+        return bilateral_blur(
+            input, self.kernel_size, self.sigma_color, self.sigma_space, self.border_type, self.color_distance_type
+        )
+
+
+class JointBilateralBlur(_BilateralBlur):
+    def forward(self, input: Tensor, guidance: Tensor) -> Tensor:
+        return joint_bilateral_blur(
+            input,
+            guidance,
+            self.kernel_size,
+            self.sigma_color,
+            self.sigma_space,
+            self.border_type,
+            self.color_distance_type,
+        )
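
Since the module is new in this commit, a minimal usage sketch may help (an illustration, not part of the commit; the tensor values are made up). Both the functional and the stateful interfaces take a 4D (B, C, H, W) tensor, such as an SDXL latent, and return a tensor of the same shape:

```python
import torch
from modules.anisotropic import BilateralBlur, bilateral_blur

x = torch.randn(1, 4, 128, 128)  # made-up stand-in for an SDXL latent (B, C, H, W)

# Functional form, using the defaults from this commit:
# kernel_size=(13, 13), sigma_color=3.0, sigma_space=3.0.
y1 = bilateral_blur(x)

# Equivalent stateful form.
blur = BilateralBlur(kernel_size=(13, 13), sigma_color=3.0, sigma_space=3.0)
y2 = blur(x)

assert y1.shape == x.shape
assert torch.allclose(y1, y2)
```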
modules/filters.py DELETED
@@ -1,32 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-
-
-def gaussian_kernel(kernel_size, sigma):
-    kernel = np.fromfunction(
-        lambda x, y: (1 / (2 * np.pi * sigma ** 2)) *
-                     np.exp(-((x - (kernel_size - 1) / 2) ** 2 + (y - (kernel_size - 1) / 2) ** 2) / (2 * sigma ** 2)),
-        (kernel_size, kernel_size)
-    )
-    return kernel / np.sum(kernel)
-
-
-class GaussianBlur(nn.Module):
-    def __init__(self, channels, kernel_size, sigma):
-        super(GaussianBlur, self).__init__()
-        self.channels = channels
-        self.kernel_size = kernel_size
-        self.sigma = sigma
-        self.padding = kernel_size // 2  # Ensure output size matches input size
-        self.register_buffer('kernel', torch.tensor(gaussian_kernel(kernel_size, sigma), dtype=torch.float32))
-        self.kernel = self.kernel.view(1, 1, kernel_size, kernel_size)
-        self.kernel = self.kernel.expand(self.channels, -1, -1, -1)  # Repeat the kernel for each input channel
-
-    def forward(self, x):
-        x = F.conv2d(x, self.kernel.to(x), padding=self.padding, groups=self.channels)
-        return x
-
-
-gaussian_filter_2d = GaussianBlur(4, 7, 0.8)
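
A reading note, not part of the commit: the deleted module applied one fixed, isotropic 7×7 Gaussian (sigma = 0.8) to every pixel via a depthwise convolution. The bilateral kernel that replaces it in modules/anisotropic.py weights each neighbor q of a pixel p by both spatial and color distance, roughly

w(p, q) = exp(-|p - q|² / (2 · sigma_space²)) · exp(-d(I_p, I_q)² / (2 · sigma_color²)),

so smoothing is suppressed across strong edges. This spatially varying, edge-aware behavior is what the commit title calls an anisotropic kernel.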
modules/patch.py CHANGED
@@ -2,10 +2,11 @@ import torch
 import comfy.model_base
 import comfy.ldm.modules.diffusionmodules.openaimodel
 import comfy.samplers
+import modules.anisotropic as anisotropic
 
 from comfy.samplers import model_management, lcm, math
 from comfy.ldm.modules.diffusionmodules.openaimodel import timestep_embedding, forward_timestep_embed
-from modules.filters import gaussian_filter_2d
+
 
 sharpness = 2.0
 
@@ -349,7 +350,7 @@ def unet_forward_patched(self, x, timesteps=None, context=None, y=None, control=
 
     alpha = 1.0 - (timesteps / 999.0)[:, None, None, None].clone()
     alpha *= 0.001 * sharpness
-    degraded_x0 = gaussian_filter_2d(x0) * alpha + x0 * (1.0 - alpha)
+    degraded_x0 = anisotropic.bilateral_blur(x0) * alpha + x0 * (1.0 - alpha)
 
     x0 = x0 * uc_mask + degraded_x0 * (1.0 - uc_mask)
 
readme.md CHANGED
@@ -97,7 +97,7 @@ Note that some of these tricks are currently (2023 Aug 11) impossible to reprodu
 
 1. Native refiner swap inside one single k-sampler. The advantage is that the refiner model can now reuse the base model's momentum (or ODE's history parameters) collected from k-sampling to achieve more coherent sampling. In Automatic1111's high-res fix and ComfyUI's node system, the base model and refiner use two independent k-samplers, which means the momentum is largely wasted and the sampling continuity is broken. Fooocus uses its own advanced k-diffusion sampling that ensures a seamless, native, and continuous swap in a refiner setup. (Update Aug 13: I actually discussed this with Automatic1111 several days ago, and it seems that the "native refiner swap inside one single k-sampler" is [merged](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/12371) into the dev branch of webui. Great!)
 2. Negative ADM guidance. Because the highest resolution level of XL Base does not have cross attentions, the positive and negative signals for XL's highest resolution level cannot receive enough contrast during CFG sampling, causing the results to look a bit plastic or overly smooth in certain cases. Fortunately, since XL's highest resolution level is still conditioned on image aspect ratios (ADM), we can modify the ADM on the positive/negative side to compensate for the lack of CFG contrast at the highest resolution level. (Update Aug 16: the iOS app [Draw Things](https://apps.apple.com/us/app/draw-things-ai-generation/id6444050820) will support Negative ADM Guidance. Great!)
-3. We implemented a carefully tuned variation of Section 5.1 of ["Improving Sample Quality of Diffusion Models Using Self-Attention Guidance"](https://arxiv.org/pdf/2210.00939.pdf). The weight is set very low, but this is Fooocus's final guarantee that XL will never yield an overly smooth or plastic appearance. This can eliminate almost all cases in which XL still occasionally produces overly smooth results even with negative ADM guidance.
+3. We implemented a carefully tuned variation of Section 5.1 of ["Improving Sample Quality of Diffusion Models Using Self-Attention Guidance"](https://arxiv.org/pdf/2210.00939.pdf). The weight is set very low, but this is Fooocus's final guarantee that XL will never yield an overly smooth or plastic appearance. This can eliminate almost all cases in which XL still occasionally produces overly smooth results even with negative ADM guidance. (Update 2023 Aug 18: the Gaussian kernel of SAG is changed to an anisotropic kernel for better structure preservation and fewer artifacts.)
 4. We modified the style templates a bit and added the "cinematic-default".
 5. We tested "sd_xl_offset_example-lora_1.0.safetensors", and it seems that when the lora weight is below 0.5, the results are always better than XL without the lora.
 6. The parameters of samplers are carefully tuned.
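
The modules/patch.py hunk above is where item 3 lands in code. A standalone sketch of that blend may clarify it (an illustration, not from the commit: the toy tensor values are made up, and the reading that uc_mask marks the unconditional batch entries is inferred from its usage):

```python
import torch
import modules.anisotropic as anisotropic

sharpness = 2.0  # same default as modules/patch.py

x0 = torch.randn(2, 4, 128, 128)        # made-up denoised latents (B, C, H, W)
timesteps = torch.tensor([900.0, 50.0])  # made-up current diffusion timesteps
uc_mask = torch.tensor([1.0, 0.0]).view(-1, 1, 1, 1)  # assumed: 1 = unconditional entry

# alpha grows as sampling proceeds (timesteps -> 0), capped at 0.001 * sharpness,
# so the blurred latent is only ever mixed in very lightly.
alpha = 1.0 - (timesteps / 999.0).view(-1, 1, 1, 1)
alpha *= 0.001 * sharpness
degraded_x0 = anisotropic.bilateral_blur(x0) * alpha + x0 * (1.0 - alpha)

# Only the conditional entries receive the degraded prediction, which raises
# the positive/negative contrast during CFG (the SAG-style trick in item 3).
x0 = x0 * uc_mask + degraded_x0 * (1.0 - uc_mask)
```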
update_log.md CHANGED
@@ -1,3 +1,7 @@
+### 1.0.36
+
+* Change Gaussian kernel to anisotropic kernel.
+
 ### 1.0.34
 
 * Random seed restoring.