AlanB
/

magic_mix_mod

Model card Files Files and versions Community

magic_mix_mod / pipeline.py

AlanB

Changed PIL Image importing method to fix error

e186a0a almost 2 years ago

raw

history blame

5.19 kB

	from typing import Union, Callable, Optional

	import torch
	import PIL
	from diffusers import (
	AutoencoderKL,
	DDIMScheduler,
	DiffusionPipeline,
	LMSDiscreteScheduler,
	PNDMScheduler,
	UNet2DConditionModel,
	)
	#from PIL import Image
	from torchvision import transforms as tfms
	from tqdm.auto import tqdm
	from transformers import CLIPTextModel, CLIPTokenizer


	class MagicMixPipeline(DiffusionPipeline):
	    """Diffusion pipeline that mixes the layout of an image with a text prompt.

	    The input image is encoded to VAE latents, noised, and then denoised under
	    classifier-free guidance from the prompt. During an initial "layout
	    generation" phase the UNet input is an interpolation between the current
	    latents and a freshly noised copy of the encoded image; afterwards (the
	    "content generation" phase) the latents are denoised on their own.
	    """

	    def __init__(
	        self,
	        vae: AutoencoderKL,
	        text_encoder: CLIPTextModel,
	        tokenizer: CLIPTokenizer,
	        unet: UNet2DConditionModel,
	        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler],
	    ):
	        """Register the sub-models and scheduler on the pipeline."""
	        super().__init__()

	        self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)

	    def encode(self, img):
	        """Convert a PIL image to VAE latents.

	        The image is turned into a tensor, given a batch dimension, rescaled
	        with ``* 2 - 1`` and passed through ``self.vae.encode``. A sample of
	        the resulting latent distribution, multiplied by 0.18215, is returned.
	        """
	        with torch.no_grad():
	            pixels = tfms.ToTensor()(img).unsqueeze(0).to(self.device) * 2 - 1
	            latent = self.vae.encode(pixels)
	            # NOTE(review): 0.18215 is presumably the Stable Diffusion VAE latent
	            # scaling factor -- confirm against vae.config.scaling_factor.
	            latent = 0.18215 * latent.latent_dist.sample()
	        return latent

	    def decode(self, latent):
	        """Convert latents back to a PIL image (first item of the batch only)."""
	        # Undo the scaling applied in ``encode``.
	        latent = (1 / 0.18215) * latent
	        with torch.no_grad():
	            img = self.vae.decode(latent).sample
	        # Map the decoder output to [0, 1], then to channel-last uint8 pixels.
	        img = (img / 2 + 0.5).clamp(0, 1)
	        img = img.detach().cpu().permute(0, 2, 3, 1).numpy()
	        img = (img * 255).round().astype("uint8")
	        return PIL.Image.fromarray(img[0])

	    def _embed_text(self, text):
	        """Tokenize ``text`` (padded/truncated to the model max length) and encode it."""
	        tokens = self.tokenizer(
	            text,
	            padding="max_length",
	            max_length=self.tokenizer.model_max_length,
	            truncation=True,
	            return_tensors="pt",
	        )
	        return self.text_encoder(tokens.input_ids.to(self.device))[0]

	    def prep_text(self, prompt):
	        """Return the unconditional and prompt embeddings, concatenated in that order.

	        The unconditional embedding is computed from the empty string. The
	        encoder runs under ``torch.no_grad()`` because the pipeline only does
	        inference; no autograd graph is needed for the embeddings.
	        """
	        with torch.no_grad():
	            text_embedding = self._embed_text(prompt)
	            uncond_embedding = self._embed_text("")

	        return torch.cat([uncond_embedding, text_embedding])

	    def _guided_step(self, model_input, t, latents, text_embeddings, guidance_scale):
	        """Run one classifier-free-guided denoising step and return the new latents.

	        ``model_input`` is what the UNet sees (duplicated for the unconditional
	        and text-conditioned halves); ``latents`` is the sample the scheduler
	        steps from.
	        """
	        batched = torch.cat([model_input] * 2)
	        batched = self.scheduler.scale_model_input(batched, t)

	        with torch.no_grad():
	            pred = self.unet(
	                batched,
	                t,
	                encoder_hidden_states=text_embeddings,
	            ).sample

	        pred_uncond, pred_text = pred.chunk(2)
	        pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)

	        return self.scheduler.step(pred, t, latents).prev_sample

	    def __call__(
	        self,
	        img: PIL.Image.Image,
	        prompt: str,
	        kmin: float = 0.3,
	        kmax: float = 0.6,
	        mix_factor: float = 0.5,
	        seed: int = 42,
	        steps: int = 50,
	        guidance_scale: float = 7.5,
	        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
	        callback_steps: Optional[int] = 1,
	    ) -> PIL.Image.Image:
	        """Generate an image that mixes ``img`` with ``prompt``.

	        Args:
	            img: Source image; its size determines the latent noise shape
	                (``height // 8`` by ``width // 8``).
	            prompt: Text prompt used for guidance.
	            kmin: Fraction of ``steps`` that sets where the layout phase ends
	                (``tmin = steps - int(kmin * steps)``).
	            kmax: Fraction of ``steps`` that sets the timestep index at which
	                the encoded image is noised and denoising starts
	                (``tmax = steps - int(kmax * steps)``).
	            mix_factor: Weight of the current latents in the layout-phase
	                interpolation; the re-noised image latents get
	                ``1 - mix_factor``.
	            seed: Value passed to ``torch.manual_seed`` before sampling noise.
	            steps: Number of scheduler timesteps to set.
	            guidance_scale: Classifier-free guidance weight.
	            callback: Optional function called as ``callback(i, t, latents)``
	                after a denoising step.
	            callback_steps: Call ``callback`` when ``i % callback_steps == 0``.
	                ``None`` is treated as 1 (every step).

	        Returns:
	            The decoded PIL image.
	        """
	        tmin = steps - int(kmin * steps)
	        tmax = steps - int(kmax * steps)
	        # ``callback_steps`` is annotated Optional; avoid ``i % None``.
	        callback_every = 1 if callback_steps is None else callback_steps

	        text_embeddings = self.prep_text(prompt)

	        self.scheduler.set_timesteps(steps)
	        start_t = self.scheduler.timesteps[tmax]

	        width, height = img.size
	        encoded = self.encode(img)

	        torch.manual_seed(seed)
	        # Read the channel count from the model config; direct attribute
	        # access on the model (``unet.in_channels``) is deprecated in diffusers.
	        noise = torch.randn(
	            (1, self.unet.config.in_channels, height // 8, width // 8),
	        ).to(self.device)

	        # Noise the encoded image to the starting timestep and take the first
	        # guided denoising step from there.
	        latents = self.scheduler.add_noise(
	            encoded,
	            noise,
	            timesteps=start_t,
	        )
	        latents = self._guided_step(latents, start_t, latents, text_embeddings, guidance_scale)

	        for i, t in enumerate(tqdm(self.scheduler.timesteps)):
	            if i <= tmax:
	                # Timesteps up to and including the starting index are skipped.
	                continue

	            if i < tmin:  # layout generation phase
	                orig_latents = self.scheduler.add_noise(
	                    encoded,
	                    noise,
	                    timesteps=t,
	                )
	                # Interpolate between layout noise and conditionally generated
	                # noise to preserve layout semantics.
	                model_input = (mix_factor * latents) + (1 - mix_factor) * orig_latents
	            else:  # content generation phase
	                model_input = latents

	            # NOTE(review): as in the original code, the scheduler steps from
	            # the unmixed ``latents``; the mixed tensor only feeds the UNet --
	            # confirm this is intended.
	            latents = self._guided_step(model_input, t, latents, text_embeddings, guidance_scale)

	            # call the callback, if provided
	            if callback is not None and i % callback_every == 0:
	                callback(i, t, latents)

	        return self.decode(latents)