| | import torch |
| |
|
| | from modules_forge.supported_preprocessor import Preprocessor, PreprocessorParameter |
| | from modules_forge.shared import add_supported_preprocessor |
| | from ldm_patched.modules.samplers import sampling_function |
| | import ldm_patched.ldm.modules.attention as attention |
| |
|
| |
|
def sdp(q, k, v, transformer_options):
    """Attention wrapper that tolerates an empty batch.

    When q has no batch entries, returns q untouched; otherwise delegates to
    the optimized attention kernel, taking the head count from
    transformer_options["n_heads"].
    """
    if q.shape[0] != 0:
        n_heads = transformer_options["n_heads"]
        return attention.optimized_attention(q, k, v, heads=n_heads, mask=None)
    return q
| |
|
| |
|
def adain(x, target_std, target_mean):
    """Adaptive instance normalization.

    Re-normalizes x's per-sample, per-channel spatial statistics (computed over
    dims 2 and 3 with population std, correction=0) to the given target
    std/mean. An empty batch passes through unchanged.
    """
    if x.shape[0] == 0:
        return x
    source_std, source_mean = torch.std_mean(x, dim=(2, 3), keepdim=True, correction=0)
    normalized = (x - source_mean) / source_std
    return normalized * target_std + target_mean
| |
|
| |
|
def zero_cat(a, b, dim):
    """Concatenate a and b along dim, skipping either operand whose batch
    dimension is empty (if both are empty, b is returned)."""
    if a.shape[0]:
        if b.shape[0]:
            return torch.cat([a, b], dim=dim)
        return a
    return b
| |
|
| |
|
class PreprocessorReference(Preprocessor):
    """Model-free 'reference' preprocessor.

    Instead of loading a ControlNet model, this unit hooks a cloned UNet so the
    reference image steers generation directly: an extra "recording" forward
    pass captures the reference's self-attention keys/values (attn1) and/or
    per-block feature statistics (AdaIN), which are then injected into the real
    denoising passes within the unit's active sigma window.
    """

    def __init__(self, name, use_attn=True, use_adain=True, priority=0):
        super().__init__()
        self.name = name
        self.use_attn = use_attn    # inject recorded attn1 keys/values
        self.use_adain = use_adain  # re-normalize block features to reference stats
        self.sorting_priority = priority
        self.tags = ['Reference']
        self.slider_resolution = PreprocessorParameter(visible=False)
        # Style-fidelity slider; presumably delivered back as unit.threshold_a
        # (read in process_before_every_sampling) — TODO confirm against caller.
        self.slider_1 = PreprocessorParameter(label='Style Fidelity', value=0.5, minimum=0.0, maximum=1.0, step=0.01, visible=True)
        self.show_control_mode = False
        self.corp_image_with_a1111_mask_when_in_img2img_inpaint_tab = False
        # No ControlNet model needs to be loaded for this unit.
        self.do_not_need_model = True

        # Transient per-sampling state: 'recording' flag plus captures keyed by
        # UNet block location; reset at the start of every sampling run.
        self.is_recording_style = False
        self.recorded_attn1 = {}
        self.recorded_h = {}

    def process_before_every_sampling(self, process, cond, mask, *args, **kwargs):
        """Install the reference hooks on a cloned UNet before sampling.

        Encodes the reference image (``cond``) into the UNet's latent space,
        then registers three hooks on a clone of the current UNet:
        a conditioning modifier that performs the recording pass, a block
        modifier for AdaIN injection, and an attn1 replacement for attention
        injection. Returns ``cond`` and ``mask`` unchanged.
        """
        unit = kwargs['unit']
        weight = float(unit.weight)
        style_fidelity = float(unit.threshold_a)
        start_percent = float(unit.guidance_start)
        end_percent = float(unit.guidance_end)

        if process.sd_model.is_sdxl:
            # NOTE(review): cubing shrinks fidelity on SDXL (0..1 range) —
            # looks empirically tuned; confirm before changing.
            style_fidelity = style_fidelity ** 3.0

        vae = process.sd_model.forge_objects.vae

        # movedim(1, -1): cond arrives channels-first; vae.encode presumably
        # expects channels-last. The encoded latent is then mapped into the
        # UNet's internal latent scaling via process_in.
        latent_image = vae.encode(cond.movedim(1, -1))
        latent_image = process.sd_model.forge_objects.unet.model.latent_format.process_in(latent_image)

        # Dedicated CPU RNG seeded with seed+1 so reference noise is
        # reproducible yet distinct from the main sampling noise stream.
        gen_seed = process.seeds[0] + 1
        gen_cpu = torch.Generator().manual_seed(gen_seed)

        unet = process.sd_model.forge_objects.unet.clone()
        # Active sigma window derived from the unit's guidance start/end
        # percents; hooks are no-ops outside [sigma_min, sigma_max].
        sigma_max = unet.model.model_sampling.percent_to_sigma(start_percent)
        sigma_min = unet.model.model_sampling.percent_to_sigma(end_percent)

        self.recorded_attn1 = {}
        self.recorded_h = {}

        def conditioning_modifier(model, x, timestep, uncond, cond, cond_scale, model_options, seed):
            # Recording pass: before each real denoiser call inside the active
            # window, noise the reference latent to the current sigma and run a
            # throwaway sampling_function call (cond_scale=1, result discarded)
            # purely so attn1_proc/block_proc capture the reference's
            # keys/values and feature statistics.
            sigma = timestep[0].item()
            if not (sigma_min <= sigma <= sigma_max):
                return model, x, timestep, uncond, cond, cond_scale, model_options, seed

            self.is_recording_style = True

            xt = latent_image.to(x) + torch.randn(x.size(), dtype=x.dtype, generator=gen_cpu).to(x) * sigma
            sampling_function(model, xt, timestep, uncond, cond, 1, model_options, seed)

            self.is_recording_style = False

            return model, x, timestep, uncond, cond, cond_scale, model_options, seed

        def block_proc(h, flag, transformer_options):
            # AdaIN hook, invoked around every UNet block ('after' only).
            # Recording pass: store the reference features' (std, mean).
            # Real pass: re-normalize features to those statistics.
            if not self.use_adain:
                return h

            if flag != 'after':
                return h

            location = transformer_options['block']

            sigma = transformer_options["sigmas"][0].item()
            if not (sigma_min <= sigma <= sigma_max):
                return h

            # Only touch high-channel (deep) blocks; larger weight lowers the
            # threshold so more blocks are affected.
            channel = int(h.shape[1])
            minimal_channel = 1500 - 1000 * weight

            if channel < minimal_channel:
                return h

            if self.is_recording_style:
                self.recorded_h[location] = torch.std_mean(h, dim=(2, 3), keepdim=True, correction=0)
                return h
            else:
                cond_indices = transformer_options['cond_indices']
                uncond_indices = transformer_options['uncond_indices']
                cond_or_uncond = transformer_options['cond_or_uncond']
                r_std, r_mean = self.recorded_h[location]

                h_c = h[cond_indices]
                h_uc = h[uncond_indices]

                # cond always adopts the reference statistics; uncond blends
                # from fully re-normalized (fidelity 0) to untouched (1).
                o_c = adain(h_c, r_std, r_mean)
                o_uc_strong = h_uc
                o_uc_weak = adain(h_uc, r_std, r_mean)
                o_uc = o_uc_weak + (o_uc_strong - o_uc_weak) * style_fidelity

                # Reassemble the batch in its original cond/uncond interleaving.
                recon = []
                for cx in cond_or_uncond:
                    if cx == 0:
                        recon.append(o_c)
                    else:
                        recon.append(o_uc)

                o = torch.cat(recon, dim=0)
                return o

        def attn1_proc(q, k, v, transformer_options):
            # Self-attention hook. Recording pass: store the reference's (k, v).
            # Real pass: let queries additionally attend over the recorded
            # reference keys/values.
            if not self.use_attn:
                return sdp(q, k, v, transformer_options)

            sigma = transformer_options["sigmas"][0].item()
            if not (sigma_min <= sigma <= sigma_max):
                return sdp(q, k, v, transformer_options)

            # Attention blocks are additionally keyed by block_index within the
            # (block_type, block_id) location.
            location = (transformer_options['block'][0], transformer_options['block'][1],
                        transformer_options['block_index'])

            # Same depth gating as block_proc but with a steeper weight slope.
            channel = int(q.shape[2])
            minimal_channel = 1500 - 1280 * weight

            if channel < minimal_channel:
                return sdp(q, k, v, transformer_options)

            if self.is_recording_style:
                self.recorded_attn1[location] = (k, v)
                return sdp(q, k, v, transformer_options)
            else:
                cond_indices = transformer_options['cond_indices']
                uncond_indices = transformer_options['uncond_indices']
                cond_or_uncond = transformer_options['cond_or_uncond']

                q_c = q[cond_indices]
                q_uc = q[uncond_indices]

                k_c = k[cond_indices]
                k_uc = k[uncond_indices]

                v_c = v[cond_indices]
                v_uc = v[uncond_indices]

                k_r, v_r = self.recorded_attn1[location]

                # cond attends to its own + the reference tokens (concatenated
                # along the token dim); uncond blends from reference-augmented
                # attention (fidelity 0) to plain attention (1).
                o_c = sdp(q_c, zero_cat(k_c, k_r, dim=1), zero_cat(v_c, v_r, dim=1), transformer_options)
                o_uc_strong = sdp(q_uc, k_uc, v_uc, transformer_options)
                o_uc_weak = sdp(q_uc, zero_cat(k_uc, k_r, dim=1), zero_cat(v_uc, v_r, dim=1), transformer_options)
                o_uc = o_uc_weak + (o_uc_strong - o_uc_weak) * style_fidelity

                # Reassemble the batch in its original cond/uncond interleaving.
                recon = []
                for cx in cond_or_uncond:
                    if cx == 0:
                        recon.append(o_c)
                    else:
                        recon.append(o_uc)

                o = torch.cat(recon, dim=0)
                return o

        unet.add_block_modifier(block_proc)
        unet.add_conditioning_modifier(conditioning_modifier)
        unet.set_model_replace_all(attn1_proc, 'attn1')

        # Swap the hooked clone in for this sampling run.
        process.sd_model.forge_objects.unet = unet

        return cond, mask
| |
|
| |
|
# Register the three reference variants: attention-only, AdaIN-only, and both.
_REFERENCE_VARIANTS = (
    dict(name='reference_only', use_attn=True, use_adain=False, priority=100),
    dict(name='reference_adain', use_attn=False, use_adain=True),
    dict(name='reference_adain+attn', use_attn=True, use_adain=True),
)

for _variant in _REFERENCE_VARIANTS:
    add_supported_preprocessor(PreprocessorReference(**_variant))
| |
|