lllyasviel committed
Commit 59ddae4 · Parent(s): 5c44c5d

SAG implemented (#88)

fooocus_version.py CHANGED
@@ -1 +1 @@
- version = '1.0.27'
+ version = '1.0.28'
modules/adm_patch.py DELETED
@@ -1,33 +0,0 @@
-import torch
-import comfy.model_base
-
-
-def sdxl_encode_adm_patched(self, **kwargs):
-    clip_pooled = kwargs["pooled_output"]
-    width = kwargs.get("width", 768)
-    height = kwargs.get("height", 768)
-    crop_w = kwargs.get("crop_w", 0)
-    crop_h = kwargs.get("crop_h", 0)
-    target_width = kwargs.get("target_width", width)
-    target_height = kwargs.get("target_height", height)
-
-    if kwargs.get("prompt_type", "") == "negative":
-        width *= 0.8
-        height *= 0.8
-    elif kwargs.get("prompt_type", "") == "positive":
-        width *= 1.5
-        height *= 1.5
-
-    out = []
-    out.append(self.embedder(torch.Tensor([height])))
-    out.append(self.embedder(torch.Tensor([width])))
-    out.append(self.embedder(torch.Tensor([crop_h])))
-    out.append(self.embedder(torch.Tensor([crop_w])))
-    out.append(self.embedder(torch.Tensor([target_height])))
-    out.append(self.embedder(torch.Tensor([target_width])))
-    flat = torch.flatten(torch.cat(out))[None, ]
-    return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
-
-
-def patch_negative_adm():
-    comfy.model_base.SDXL.encode_adm = sdxl_encode_adm_patched
modules/async_worker.py CHANGED
@@ -53,8 +53,8 @@ def worker():
 
     results = []
     seed = image_seed
-    if not isinstance(seed, int) or seed < 0 or seed > 65535:
-        seed = random.randint(1, 65535)
+    if not isinstance(seed, int) or seed < 0 or seed > 1024*1024*1024:
+        seed = random.randint(1, 1024*1024*1024)
 
     all_steps = steps * image_number
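A quick sketch of the widened seed guard (illustrative only, not part of the commit): note that `1024*1024*1024 == 2**30`, replacing the previous 16-bit-style bound of 65535.

```python
import random

MAX_SEED = 1024 * 1024 * 1024  # == 2**30; the previous bound was 65535

def normalize_seed(seed):
    # Mirrors the guard in worker(): non-int or out-of-range seeds
    # are replaced by a random seed in [1, MAX_SEED].
    if not isinstance(seed, int) or seed < 0 or seed > MAX_SEED:
        seed = random.randint(1, MAX_SEED)
    return seed

print(normalize_seed(-1))          # replaced by a random seed
print(normalize_seed(123456789))   # kept as-is (within range)
```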
modules/core.py CHANGED
@@ -11,10 +11,10 @@ from comfy.sd import load_checkpoint_guess_config
 from nodes import VAEDecode, EmptyLatentImage, CLIPTextEncode
 from comfy.sample import prepare_mask, broadcast_cond, load_additional_models, cleanup_additional_models
 from modules.samplers_advanced import KSampler, KSamplerWithRefiner
-from modules.adm_patch import patch_negative_adm
+from modules.patch import patch_all
 
 
-patch_negative_adm()
+patch_all()
 opCLIPTextEncode = CLIPTextEncode()
 opEmptyLatentImage = EmptyLatentImage()
 opVAEDecode = VAEDecode()
modules/filters.py ADDED
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+
+
+def gaussian_kernel(kernel_size, sigma):
+    kernel = np.fromfunction(
+        lambda x, y: (1 / (2 * np.pi * sigma ** 2)) *
+                     np.exp(-((x - (kernel_size - 1) / 2) ** 2 + (y - (kernel_size - 1) / 2) ** 2) / (2 * sigma ** 2)),
+        (kernel_size, kernel_size)
+    )
+    return kernel / np.sum(kernel)
+
+
+class GaussianBlur(nn.Module):
+    def __init__(self, channels, kernel_size, sigma):
+        super(GaussianBlur, self).__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.sigma = sigma
+        self.padding = kernel_size // 2  # Ensure output size matches input size
+        self.register_buffer('kernel', torch.tensor(gaussian_kernel(kernel_size, sigma), dtype=torch.float32))
+        self.kernel = self.kernel.view(1, 1, kernel_size, kernel_size)
+        self.kernel = self.kernel.expand(self.channels, -1, -1, -1)  # Repeat the kernel for each input channel
+
+    def forward(self, x):
+        x = F.conv2d(x, self.kernel.to(x), padding=self.padding, groups=self.channels)
+        return x
+
+
+gaussian_filter_2d = GaussianBlur(4, 7, 0.8)
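A usage sketch (illustrative only, not part of the commit): the module-level `gaussian_filter_2d = GaussianBlur(4, 7, 0.8)` instance is sized for SDXL's 4-channel latent space, and `padding = kernel_size // 2` keeps the spatial dimensions unchanged.

```python
import torch
from modules.filters import gaussian_filter_2d

# A latent-sized tensor: batch 1, 4 channels, 128x128
# (the latent of a 1024x1024 image at the usual 8x downscale).
latent = torch.randn(1, 4, 128, 128)

# Depthwise 7x7 Gaussian blur (sigma=0.8); groups=4 applies the
# same single-channel kernel to each channel independently.
blurred = gaussian_filter_2d(latent)

assert blurred.shape == latent.shape  # 'same' padding preserves H and W
```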
modules/patch.py ADDED
@@ -0,0 +1,387 @@
+import torch
+import comfy.model_base
+import comfy.ldm.modules.diffusionmodules.openaimodel
+import comfy.samplers
+
+from comfy.samplers import model_management, lcm, math
+from comfy.ldm.modules.diffusionmodules.openaimodel import timestep_embedding, forward_timestep_embed
+from modules.filters import gaussian_filter_2d
+
+
+def sampling_function_patched(model_function, x, timestep, uncond, cond, cond_scale, cond_concat=None, model_options={},
+                              seed=None):
+    def get_area_and_mult(cond, x_in, cond_concat_in, timestep_in):
+        area = (x_in.shape[2], x_in.shape[3], 0, 0)
+        strength = 1.0
+        if 'timestep_start' in cond[1]:
+            timestep_start = cond[1]['timestep_start']
+            if timestep_in[0] > timestep_start:
+                return None
+        if 'timestep_end' in cond[1]:
+            timestep_end = cond[1]['timestep_end']
+            if timestep_in[0] < timestep_end:
+                return None
+        if 'area' in cond[1]:
+            area = cond[1]['area']
+        if 'strength' in cond[1]:
+            strength = cond[1]['strength']
+
+        adm_cond = None
+        if 'adm_encoded' in cond[1]:
+            adm_cond = cond[1]['adm_encoded']
+
+        input_x = x_in[:, :, area[2]:area[0] + area[2], area[3]:area[1] + area[3]]
+        if 'mask' in cond[1]:
+            # Scale the mask to the size of the input
+            # The mask should have been resized as we began the sampling process
+            mask_strength = 1.0
+            if "mask_strength" in cond[1]:
+                mask_strength = cond[1]["mask_strength"]
+            mask = cond[1]['mask']
+            assert (mask.shape[1] == x_in.shape[2])
+            assert (mask.shape[2] == x_in.shape[3])
+            mask = mask[:, area[2]:area[0] + area[2], area[3]:area[1] + area[3]] * mask_strength
+            mask = mask.unsqueeze(1).repeat(input_x.shape[0] // mask.shape[0], input_x.shape[1], 1, 1)
+        else:
+            mask = torch.ones_like(input_x)
+        mult = mask * strength
+
+        if 'mask' not in cond[1]:
+            rr = 8
+            if area[2] != 0:
+                for t in range(rr):
+                    mult[:, :, t:1 + t, :] *= ((1.0 / rr) * (t + 1))
+            if (area[0] + area[2]) < x_in.shape[2]:
+                for t in range(rr):
+                    mult[:, :, area[0] - 1 - t:area[0] - t, :] *= ((1.0 / rr) * (t + 1))
+            if area[3] != 0:
+                for t in range(rr):
+                    mult[:, :, :, t:1 + t] *= ((1.0 / rr) * (t + 1))
+            if (area[1] + area[3]) < x_in.shape[3]:
+                for t in range(rr):
+                    mult[:, :, :, area[1] - 1 - t:area[1] - t] *= ((1.0 / rr) * (t + 1))
+
+        conditionning = {}
+        conditionning['c_crossattn'] = cond[0]
+        if cond_concat_in is not None and len(cond_concat_in) > 0:
+            cropped = []
+            for x in cond_concat_in:
+                cr = x[:, :, area[2]:area[0] + area[2], area[3]:area[1] + area[3]]
+                cropped.append(cr)
+            conditionning['c_concat'] = torch.cat(cropped, dim=1)
+
+        if adm_cond is not None:
+            conditionning['c_adm'] = adm_cond
+
+        control = None
+        if 'control' in cond[1]:
+            control = cond[1]['control']
+
+        patches = None
+        if 'gligen' in cond[1]:
+            gligen = cond[1]['gligen']
+            patches = {}
+            gligen_type = gligen[0]
+            gligen_model = gligen[1]
+            if gligen_type == "position":
+                gligen_patch = gligen_model.set_position(input_x.shape, gligen[2], input_x.device)
+            else:
+                gligen_patch = gligen_model.set_empty(input_x.shape, input_x.device)
+
+            patches['middle_patch'] = [gligen_patch]
+
+        return (input_x, mult, conditionning, area, control, patches)
+
+    def cond_equal_size(c1, c2):
+        if c1 is c2:
+            return True
+        if c1.keys() != c2.keys():
+            return False
+        if 'c_crossattn' in c1:
+            s1 = c1['c_crossattn'].shape
+            s2 = c2['c_crossattn'].shape
+            if s1 != s2:
+                if s1[0] != s2[0] or s1[2] != s2[2]:  # these 2 cases should not happen
+                    return False
+
+                mult_min = lcm(s1[1], s2[1])
+                diff = mult_min // min(s1[1], s2[1])
+                if diff > 4:  # arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much
+                    return False
+        if 'c_concat' in c1:
+            if c1['c_concat'].shape != c2['c_concat'].shape:
+                return False
+        if 'c_adm' in c1:
+            if c1['c_adm'].shape != c2['c_adm'].shape:
+                return False
+        return True
+
+    def can_concat_cond(c1, c2):
+        if c1[0].shape != c2[0].shape:
+            return False
+
+        # control
+        if (c1[4] is None) != (c2[4] is None):
+            return False
+        if c1[4] is not None:
+            if c1[4] is not c2[4]:
+                return False
+
+        # patches
+        if (c1[5] is None) != (c2[5] is None):
+            return False
+        if (c1[5] is not None):
+            if c1[5] is not c2[5]:
+                return False
+
+        return cond_equal_size(c1[2], c2[2])
+
+    def cond_cat(c_list):
+        c_crossattn = []
+        c_concat = []
+        c_adm = []
+        crossattn_max_len = 0
+        for x in c_list:
+            if 'c_crossattn' in x:
+                c = x['c_crossattn']
+                if crossattn_max_len == 0:
+                    crossattn_max_len = c.shape[1]
+                else:
+                    crossattn_max_len = lcm(crossattn_max_len, c.shape[1])
+                c_crossattn.append(c)
+            if 'c_concat' in x:
+                c_concat.append(x['c_concat'])
+            if 'c_adm' in x:
+                c_adm.append(x['c_adm'])
+        out = {}
+        c_crossattn_out = []
+        for c in c_crossattn:
+            if c.shape[1] < crossattn_max_len:
+                c = c.repeat(1, crossattn_max_len // c.shape[1], 1)  # padding with repeat doesn't change result
+            c_crossattn_out.append(c)
+
+        if len(c_crossattn_out) > 0:
+            out['c_crossattn'] = [torch.cat(c_crossattn_out)]
+        if len(c_concat) > 0:
+            out['c_concat'] = [torch.cat(c_concat)]
+        if len(c_adm) > 0:
+            out['c_adm'] = torch.cat(c_adm)
+        return out
+
+    def calc_cond_uncond_batch(model_function, cond, uncond, x_in, timestep, max_total_area, cond_concat_in,
+                               model_options):
+        out_cond = torch.zeros_like(x_in)
+        out_count = torch.ones_like(x_in) / 100000.0
+
+        out_uncond = torch.zeros_like(x_in)
+        out_uncond_count = torch.ones_like(x_in) / 100000.0
+
+        COND = 0
+        UNCOND = 1
+
+        to_run = []
+        for x in cond:
+            p = get_area_and_mult(x, x_in, cond_concat_in, timestep)
+            if p is None:
+                continue
+
+            to_run += [(p, COND)]
+        if uncond is not None:
+            for x in uncond:
+                p = get_area_and_mult(x, x_in, cond_concat_in, timestep)
+                if p is None:
+                    continue
+
+                to_run += [(p, UNCOND)]
+
+        while len(to_run) > 0:
+            first = to_run[0]
+            first_shape = first[0][0].shape
+            to_batch_temp = []
+            for x in range(len(to_run)):
+                if can_concat_cond(to_run[x][0], first[0]):
+                    to_batch_temp += [x]
+
+            to_batch_temp.reverse()
+            to_batch = to_batch_temp[:1]
+
+            for i in range(1, len(to_batch_temp) + 1):
+                batch_amount = to_batch_temp[:len(to_batch_temp) // i]
+                if (len(batch_amount) * first_shape[0] * first_shape[2] * first_shape[3] < max_total_area):
+                    to_batch = batch_amount
+                    break
+
+            input_x = []
+            mult = []
+            c = []
+            cond_or_uncond = []
+            area = []
+            control = None
+            patches = None
+            for x in to_batch:
+                o = to_run.pop(x)
+                p = o[0]
+                input_x += [p[0]]
+                mult += [p[1]]
+                c += [p[2]]
+                area += [p[3]]
+                cond_or_uncond += [o[1]]
+                control = p[4]
+                patches = p[5]
+
+            batch_chunks = len(cond_or_uncond)
+            input_x = torch.cat(input_x)
+            c = cond_cat(c)
+            timestep_ = torch.cat([timestep] * batch_chunks)
+
+            if control is not None:
+                c['control'] = control.get_control(input_x, timestep_, c, len(cond_or_uncond))
+
+            transformer_options = {}
+            if 'transformer_options' in model_options:
+                transformer_options = model_options['transformer_options'].copy()
+
+            if patches is not None:
+                if "patches" in transformer_options:
+                    cur_patches = transformer_options["patches"].copy()
+                    for p in patches:
+                        if p in cur_patches:
+                            cur_patches[p] = cur_patches[p] + patches[p]
+                        else:
+                            cur_patches[p] = patches[p]
+                else:
+                    transformer_options["patches"] = patches
+
+            c['transformer_options'] = transformer_options
+
+            transformer_options['uc_mask'] = torch.Tensor(cond_or_uncond).to(input_x).float()[:, None, None, None]
+
+            if 'model_function_wrapper' in model_options:
+                output = model_options['model_function_wrapper'](model_function,
+                                                                 {"input": input_x, "timestep": timestep_, "c": c,
+                                                                  "cond_or_uncond": cond_or_uncond}).chunk(batch_chunks)
+            else:
+                output = model_function(input_x, timestep_, **c).chunk(batch_chunks)
+            del input_x
+
+            model_management.throw_exception_if_processing_interrupted()
+
+            for o in range(batch_chunks):
+                if cond_or_uncond[o] == COND:
+                    out_cond[:, :, area[o][2]:area[o][0] + area[o][2], area[o][3]:area[o][1] + area[o][3]] += output[o] * mult[o]
+                    out_count[:, :, area[o][2]:area[o][0] + area[o][2], area[o][3]:area[o][1] + area[o][3]] += mult[o]
+                else:
+                    out_uncond[:, :, area[o][2]:area[o][0] + area[o][2], area[o][3]:area[o][1] + area[o][3]] += output[o] * mult[o]
+                    out_uncond_count[:, :, area[o][2]:area[o][0] + area[o][2], area[o][3]:area[o][1] + area[o][3]] += mult[o]
+            del mult
+
+        out_cond /= out_count
+        del out_count
+        out_uncond /= out_uncond_count
+        del out_uncond_count
+
+        return out_cond, out_uncond
+
+    max_total_area = model_management.maximum_batch_area()
+    if math.isclose(cond_scale, 1.0):
+        uncond = None
+
+    cond, uncond = calc_cond_uncond_batch(model_function, cond, uncond, x, timestep, max_total_area, cond_concat,
+                                          model_options)
+    if "sampler_cfg_function" in model_options:
+        args = {"cond": cond, "uncond": uncond, "cond_scale": cond_scale, "timestep": timestep}
+        return model_options["sampler_cfg_function"](args)
+    else:
+        return uncond + (cond - uncond) * cond_scale
+
+
+def unet_forward_patched(self, x, timesteps=None, context=None, y=None, control=None, transformer_options={}, **kwargs):
+    uc_mask = transformer_options['uc_mask']
+    transformer_options["original_shape"] = list(x.shape)
+    transformer_options["current_index"] = 0
+
+    hs = []
+    t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(self.dtype)
+    emb = self.time_embed(t_emb)
+
+    if self.num_classes is not None:
+        assert y.shape[0] == x.shape[0]
+        emb = emb + self.label_emb(y)
+
+    h = x.type(self.dtype)
+    for id, module in enumerate(self.input_blocks):
+        transformer_options["block"] = ("input", id)
+        h = forward_timestep_embed(module, h, emb, context, transformer_options)
+        if control is not None and 'input' in control and len(control['input']) > 0:
+            ctrl = control['input'].pop()
+            if ctrl is not None:
+                h += ctrl
+        hs.append(h)
+    transformer_options["block"] = ("middle", 0)
+    h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options)
+    if control is not None and 'middle' in control and len(control['middle']) > 0:
+        h += control['middle'].pop()
+
+    for id, module in enumerate(self.output_blocks):
+        transformer_options["block"] = ("output", id)
+        hsp = hs.pop()
+        if control is not None and 'output' in control and len(control['output']) > 0:
+            ctrl = control['output'].pop()
+            if ctrl is not None:
+                hsp += ctrl
+
+        h = torch.cat([h, hsp], dim=1)
+        del hsp
+        if len(hs) > 0:
+            output_shape = hs[-1].shape
+        else:
+            output_shape = None
+        h = forward_timestep_embed(module, h, emb, context, transformer_options, output_shape)
+    h = h.type(x.dtype)
+    x0 = self.out(h)
+
+    alpha = 1.0 - ((timesteps / 999.0)[:, None, None, None].clone() ** 2.0)
+    alpha *= 0.01
+    degraded_x0 = gaussian_filter_2d(x0) * alpha + x0 * (1.0 - alpha)
+
+    x0 = x0 * uc_mask + degraded_x0 * (1.0 - uc_mask)
+
+    return x0
+
+
+def sdxl_encode_adm_patched(self, **kwargs):
+    clip_pooled = kwargs["pooled_output"]
+    width = kwargs.get("width", 768)
+    height = kwargs.get("height", 768)
+    crop_w = kwargs.get("crop_w", 0)
+    crop_h = kwargs.get("crop_h", 0)
+    target_width = kwargs.get("target_width", width)
+    target_height = kwargs.get("target_height", height)
+
+    if kwargs.get("prompt_type", "") == "negative":
+        width *= 0.8
+        height *= 0.8
+    elif kwargs.get("prompt_type", "") == "positive":
+        width *= 1.5
+        height *= 1.5
+
+    out = []
+    out.append(self.embedder(torch.Tensor([height])))
+    out.append(self.embedder(torch.Tensor([width])))
+    out.append(self.embedder(torch.Tensor([crop_h])))
+    out.append(self.embedder(torch.Tensor([crop_w])))
+    out.append(self.embedder(torch.Tensor([target_height])))
+    out.append(self.embedder(torch.Tensor([target_width])))
+    flat = torch.flatten(torch.cat(out))[None, ]
+    return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
+
+
+def patch_all():
+    comfy.samplers.sampling_function = sampling_function_patched
+    comfy.model_base.SDXL.encode_adm = sdxl_encode_adm_patched
+    comfy.ldm.modules.diffusionmodules.openaimodel.UNetModel.forward = unet_forward_patched
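To see the SAG-style step in isolation, here is a minimal restatement of the last few lines of `unet_forward_patched` (illustrative only; the helper name `sag_blend` is ours):

```python
import torch
from modules.filters import gaussian_filter_2d

def sag_blend(x0, timesteps, uc_mask):
    # Blur weight: at most 0.01 at t=999 (the noisiest step), decaying
    # quadratically to 0 as denoising finishes.
    alpha = 1.0 - (timesteps / 999.0)[:, None, None, None] ** 2.0
    alpha = alpha * 0.01
    # Blend a depthwise-Gaussian-blurred copy into the UNet prediction.
    degraded_x0 = gaussian_filter_2d(x0) * alpha + x0 * (1.0 - alpha)
    # uc_mask is set in sampling_function_patched (1.0 for uncond chunks,
    # 0.0 for cond chunks), so only one branch receives the degraded output.
    return x0 * uc_mask + degraded_x0 * (1.0 - uc_mask)
```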
readme.md CHANGED
@@ -96,12 +96,13 @@ Note that some of these tricks are currently (2023 Aug 11) impossible to reprodu
 
 1. Native refiner swap inside one single k-sampler. The advantage is that now the refiner model can reuse the base model's momentum (or ODE's history parameters) collected from k-sampling to achieve more coherent sampling. In Automatic1111's high-res fix and ComfyUI's node system, the base model and refiner use two independent k-samplers, which means the momentum is largely wasted and the sampling continuity is broken. Fooocus uses its own advanced k-diffusion sampling that ensures a seamless, native, and continuous swap in a refiner setup. (Update Aug 13: Actually I discussed this with Automatic1111 several days ago and it seems that the “native refiner swap inside one single k-sampler” is [merged](https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/12371) into the dev branch of webui. Great!)
 2. Negative ADM guidance. Because the highest resolution level of XL Base does not have cross attentions, the positive and negative signals for XL's highest resolution level cannot receive enough contrast during CFG sampling, causing the results to look a bit plastic or overly smooth in certain cases. Fortunately, since XL's highest resolution level is still conditioned on image aspect ratios (ADM), we can modify the ADM on the positive/negative side to compensate for the lack of CFG contrast at the highest resolution level.
-3. We modified the style templates a bit and added the "cinematic-default".
-4. We tested the "sd_xl_offset_example-lora_1.0.safetensors" and it seems that when the lora weight is below 0.5, the results are always better than XL without lora.
-5. The parameters of samplers are carefully tuned.
-6. Because XL uses positional encoding for generation resolution, images generated by several fixed resolutions look a bit better than that from arbitrary resolutions (because the positional encoding is not very good at handling int numbers that are unseen during training). This suggests that the resolutions in UI may be hard coded for best results.
-7. Separated prompts for two different text encoders seem unnecessary. Separated prompts for base model and refiner may work but the effects are random, and we refrain from implement this.
-8. DPM family seems well-suited for XL, since XL sometimes generates overly smooth texture but DPM family sometimes generate overly dense detail in texture. Their joint effect looks neutral and appealing to human perception.
+3. We implemented a carefully tuned variation of Section 5.1 of ["Improving Sample Quality of Diffusion Models Using Self-Attention Guidance"](https://arxiv.org/pdf/2210.00939.pdf). The weight is set very low, but this is Fooocus's final guarantee that XL will never yield an overly smooth or plastic appearance. This almost entirely eliminates the cases where XL would still occasionally produce overly smooth results even with negative ADM guidance.
+4. We modified the style templates a bit and added the "cinematic-default".
+5. We tested the "sd_xl_offset_example-lora_1.0.safetensors" and it seems that when the lora weight is below 0.5, the results are always better than XL without lora.
+6. The parameters of samplers are carefully tuned.
+7. Because XL uses positional encoding for generation resolution, images generated at several fixed resolutions look a bit better than those from arbitrary resolutions (because the positional encoding is not very good at handling int numbers that are unseen during training). This suggests that the resolutions in the UI may be hard-coded for best results.
+8. Separated prompts for the two different text encoders seem unnecessary. Separated prompts for the base model and refiner may work, but the effects are random, and we refrain from implementing this.
+9. The DPM family seems well-suited for XL, since XL sometimes generates overly smooth texture while the DPM family sometimes generates overly dense detail in texture. Their joint effect looks neutral and appealing to human perception.
 
 ## Thanks
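As a concrete illustration of item 2 above (the numbers are ours, derived from `sdxl_encode_adm_patched` in this commit): for a 1024x1024 render, the positive branch is conditioned as if the image were larger and the negative branch as if it were smaller, so CFG sees a size contrast even at the resolution level that has no cross-attention.

```python
# Illustrative arithmetic only: the ADM size contrast produced by
# sdxl_encode_adm_patched for a 1024x1024 generation.
width = height = 1024
positive_adm_size = (width * 1.5, height * 1.5)  # (1536.0, 1536.0)
negative_adm_size = (width * 0.8, height * 0.8)  # (819.2, 819.2)
print(positive_adm_size, negative_adm_size)
```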
update_log.md CHANGED
@@ -1,3 +1,7 @@
+### 1.0.28
+
+* SAG implemented
+
 ### 1.0.27
 
 * Fix small problem in textbox css