# %%
# an example script showing how to do outpainting with the diffusers img2img pipeline
# should be compatible with any stable diffusion model
# (only tested with runwayml/stable-diffusion-v1-5)
from typing import Callable, List, Optional, Union

from PIL import Image
import PIL
import numpy as np
import torch

from diffusers import StableDiffusionImg2ImgPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import preprocess

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    revision="fp16",
    torch_dtype=torch.float16,
)
pipe.set_use_memory_efficient_attention_xformers(True)
pipe.to("cuda")

# %%
# load the primed image and extract the mask from its alpha channel
# (transparent pixels, alpha == 0, mark the region to be outpainted)
rgba = Image.open('primed_image_with_alpha_channel.png')
mask_full = np.array(rgba)[:, :, 3] == 0
rgb = rgba.convert('RGB')

# %%
# resize/convert the mask to the latent resolution
# for a 512x512 image, the mask should be 1x4x64x64
hw = np.array(mask_full.shape)
h, w = (hw - hw % 32) // 8
mask_image = Image.fromarray(mask_full).resize((w, h), Image.NEAREST)
mask = (np.array(mask_image) == 0)[None, None]   # True where the original image is kept
mask = np.concatenate([mask] * 4, axis=1)        # repeat across the 4 latent channels
mask = torch.from_numpy(mask).to('cuda')
mask.shape

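# %%
# quick sanity check, not part of the original script: the mask built above has to
# line up with the latents the pipeline will denoise, i.e. 4 channels at 1/8 of the
# spatial size that preprocess() actually produces. if this assert fires, the
# hard-coded rounding above does not match your diffusers version.
latent_h, latent_w = (s // 8 for s in preprocess(rgb).shape[-2:])
assert mask.shape == (1, 4, latent_h, latent_w), (mask.shape, (latent_h, latent_w))
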
# %%
@torch.no_grad()
def outpaint(
    self: StableDiffusionImg2ImgPipeline,
    prompt: Union[str, List[str]] = None,
    image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    strength: float = 0.8,
    num_inference_steps: Optional[int] = 50,
    guidance_scale: Optional[float] = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: Optional[float] = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: Optional[int] = 1,
    **kwargs,
):
    r"""
    copy of the original img2img pipeline's __call__()
    https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py

    Changes are marked with <<<<<<< and >>>>>>>
    """
    # message = "Please use `image` instead of `init_image`."
    # init_image = deprecate("init_image", "0.14.0", message, take_from=kwargs)
    # image = init_image or image

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(prompt, strength, callback_steps,
                      negative_prompt, prompt_embeds, negative_prompt_embeds)

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]
    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    prompt_embeds = self._encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # 4. Preprocess image
    image = preprocess(image)

    # 5. set timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps, strength, device)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)

    # 6. Prepare latent variables
    latents = self.prepare_latents(
        image, latent_timestep, batch_size, num_images_per_prompt,
        prompt_embeds.dtype, device, generator
    )

    # <<<<<<<
    # store the encoded version of the original image to overwrite
    # what the UNET generates "underneath" our image on each step
    encoded_original = (self.vae.config.scaling_factor * self.vae.encode(
        image.to(latents.device, latents.dtype)
    ).latent_dist.mean)
    # >>>>>>>

    # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 8. Denoising loop
    num_warmup_steps = len(timesteps) - \
        num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat(
                [latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(
                latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(latent_model_input, t,
                                   encoder_hidden_states=prompt_embeds).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * \
                    (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(
                noise_pred, t, latents, **extra_step_kwargs).prev_sample

            # <<<<<<<
            # paste unmasked regions from the original image
            noise = torch.randn(
                encoded_original.shape, generator=generator, device=device)
            noised_encoded_original = self.scheduler.add_noise(
                encoded_original, noise, t).to(noise_pred.device, noise_pred.dtype)
            latents[mask] = noised_encoded_original[mask]
            # >>>>>>>

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # 9. Post-processing
    image = self.decode_latents(latents)

    # 10. Run safety checker
    image, has_nsfw_concept = self.run_safety_checker(
        image, device, prompt_embeds.dtype)

    # 11. Convert to PIL
    if output_type == "pil":
        image = self.numpy_to_pil(image)

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)


# %%
image = outpaint(
    pipe,
    image=rgb,
    prompt="forest in the style of Tim Hildebrandt",
    strength=0.5,
    num_inference_steps=50,
    guidance_scale=7.5,
).images[0]
image

# %%
# the vae encoding is lossy, so we could get better quality by pasting the original
# image back into our result. this may yield visible edges.
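
# %%
# a minimal sketch of that idea (not part of the original script): composite the
# original pixels over the generated result, using the primed image's alpha channel
# as the paste mask. the generated image can be slightly smaller than the input
# because preprocess() rounds the size down, so resize it back first.
result = image.resize(rgba.size, Image.LANCZOS)
alpha = rgba.split()[-1]                 # 255 where the original image exists, 0 where outpainted
result = Image.composite(rgb, result, alpha)
result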