max committed on
Commit 89023a7
1 Parent(s): 04dbeac

added example scripts

Files changed (3)
  1. app.py +4 -0
  2. outpainting_example1.py +38 -0
  3. outpainting_example2.py +197 -0
app.py CHANGED
@@ -308,6 +308,10 @@ with gr.Blocks() as demo:
  # MAT Primer for Stable Diffusion
  ## based on MAT: Mask-Aware Transformer for Large Hole Image Inpainting
  ### create a primer for use in stable diffusion outpainting
+
+ I have added 2 example scripts to the repo:
+ - outpainting_example1.py using the inpainting pipeline
+ - outpainting_example2.py using the img2img pipeline. This is basically what I used for the examples below.
  ''')

  gr.HTML(f'''<a href="{maturl}">{maturl}</a>''')
outpainting_example1.py ADDED
@@ -0,0 +1,38 @@
+ # %%
+ # an example script of how to do outpainting with the diffusers inpainting pipeline
+ # this is basically just the example from
+ # https://huggingface.co/runwayml/stable-diffusion-inpainting
+ # %%
+ from diffusers import StableDiffusionInpaintPipeline
+
+ from PIL import Image
+ import numpy as np
+ import torch
+
+ from diffusers import StableDiffusionInpaintPipeline
+
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(
+     "runwayml/stable-diffusion-inpainting",
+     revision="fp16",
+     torch_dtype=torch.float16,
+ )
+ pipe.to("cuda")
+
+ # load the image, extract the mask
+ rgba = Image.open('primed_image_with_alpha_channel.png')
+ mask_image = Image.fromarray(np.array(rgba)[:, :, 3] == 0)
+
+ # run the pipeline
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench."
+ # image and mask_image should be PIL images.
+ # The mask structure is white for outpainting and black for keeping as is
+ image = pipe(
+     prompt=prompt,
+     image=rgba,
+     mask_image=mask_image,
+ ).images[0]
+ image
+
+ # %%
+ # the vae does lossy encoding, we could get better quality if we pasted the original image into our result.
+ # this may yield visible edges
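
As the closing comment notes, the VAE encode/decode is lossy, so pasting the original pixels back over the generated result can recover detail in the known region. A minimal sketch of that paste (not part of the committed script; it assumes the `rgba` and `image` variables from above and that the pipeline output has the same size as the primed image):

# sketch: composite the original pixels over the pipeline output,
# using the alpha channel as the paste mask (255 = keep original, 0 = keep generated)
result = image.copy()
result.paste(rgba.convert('RGB'), box=(0, 0), mask=rgba.getchannel('A'))
result

As the comment also warns, this hard composite may leave visible edges at the seam.
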
outpainting_example2.py ADDED
@@ -0,0 +1,197 @@
+ # %%
+ # an example script of how to do outpainting with diffusers img2img pipeline
+ # should be compatible with any stable diffusion model
+ # (only tested with runwayml/stable-diffusion-v1-5)
+
+ from typing import Callable, List, Optional, Union
+ from PIL import Image
+ import PIL
+ import numpy as np
+ import torch
+
+ from diffusers import StableDiffusionImg2ImgPipeline
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import preprocess
+
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+     "runwayml/stable-diffusion-v1-5",
+     revision="fp16",
+     torch_dtype=torch.float16,
+ )
+
+ pipe.set_use_memory_efficient_attention_xformers(True)
+ pipe.to("cuda")
+ # %%
+ # load the image, extract the mask
+ rgba = Image.open('primed_image_with_alpha_channel.png')
+ mask_full = np.array(rgba)[:, :, 3] == 0
+ rgb = rgba.convert('RGB')
+ # %%
+
+ # resize/convert the mask to the right size
+ # for 512x512, the mask should be 1x4x64x64
+ hw = np.array(mask_full.shape)
+ h, w = (hw - hw % 32) // 8
+ mask_image = Image.fromarray(mask_full).resize((w, h), Image.NEAREST)
+ mask = (np.array(mask_image) == 0)[None, None]
+ mask = np.concatenate([mask]*4, axis=1)
+ mask = torch.from_numpy(mask).to('cuda')
+ mask.shape
+
+ # %%
+
+
+ @torch.no_grad()
+ def outpaint(
+     self: StableDiffusionImg2ImgPipeline,
+     prompt: Union[str, List[str]] = None,
+     image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+     strength: float = 0.8,
+     num_inference_steps: Optional[int] = 50,
+     guidance_scale: Optional[float] = 7.5,
+     negative_prompt: Optional[Union[str, List[str]]] = None,
+     num_images_per_prompt: Optional[int] = 1,
+     eta: Optional[float] = 0.0,
+     generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+     prompt_embeds: Optional[torch.FloatTensor] = None,
+     negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+     output_type: Optional[str] = "pil",
+     return_dict: bool = True,
+     callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+     callback_steps: Optional[int] = 1,
+     **kwargs,
+ ):
+     r"""
+     copy of the original img2img pipeline's __call__()
+     https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+
+     Changes are marked with <EDIT> and </EDIT>
+     """
+     # message = "Please use `image` instead of `init_image`."
+     # init_image = deprecate("init_image", "0.14.0", message, take_from=kwargs)
+     # image = init_image or image
+
+     # 1. Check inputs. Raise error if not correct
+     self.check_inputs(prompt, strength, callback_steps,
+                       negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+     # 2. Define call parameters
+     if prompt is not None and isinstance(prompt, str):
+         batch_size = 1
+     elif prompt is not None and isinstance(prompt, list):
+         batch_size = len(prompt)
+     else:
+         batch_size = prompt_embeds.shape[0]
+     device = self._execution_device
+     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+     # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+     # corresponds to doing no classifier free guidance.
+     do_classifier_free_guidance = guidance_scale > 1.0
+
+     # 3. Encode input prompt
+     prompt_embeds = self._encode_prompt(
+         prompt,
+         device,
+         num_images_per_prompt,
+         do_classifier_free_guidance,
+         negative_prompt,
+         prompt_embeds=prompt_embeds,
+         negative_prompt_embeds=negative_prompt_embeds,
+     )
+
+     # 4. Preprocess image
+     image = preprocess(image)
+
+     # 5. set timesteps
+     self.scheduler.set_timesteps(num_inference_steps, device=device)
+     timesteps, num_inference_steps = self.get_timesteps(
+         num_inference_steps, strength, device)
+     latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+     # 6. Prepare latent variables
+     latents = self.prepare_latents(
+         image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+     )
+
+     # <EDIT>
+     # store the encoded version of the original image to overwrite
+     # what the UNET generates "underneath" our image on each step
+     encoded_original = (self.vae.config.scaling_factor *
+                         self.vae.encode(
+                             image.to(latents.device, latents.dtype)
+                         ).latent_dist.mean)
+     # </EDIT>
+
+     # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+     extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+     # 8. Denoising loop
+     num_warmup_steps = len(timesteps) - \
+         num_inference_steps * self.scheduler.order
+     with self.progress_bar(total=num_inference_steps) as progress_bar:
+         for i, t in enumerate(timesteps):
+             # expand the latents if we are doing classifier free guidance
+             latent_model_input = torch.cat(
+                 [latents] * 2) if do_classifier_free_guidance else latents
+             latent_model_input = self.scheduler.scale_model_input(
+                 latent_model_input, t)
+
+             # predict the noise residual
+             noise_pred = self.unet(latent_model_input, t,
+                                    encoder_hidden_states=prompt_embeds).sample
+
+             # perform guidance
+             if do_classifier_free_guidance:
+                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                 noise_pred = noise_pred_uncond + guidance_scale * \
+                     (noise_pred_text - noise_pred_uncond)
+
+             # compute the previous noisy sample x_t -> x_t-1
+             latents = self.scheduler.step(
+                 noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+             # <EDIT> paste unmasked regions from the original image
+             noise = torch.randn(
+                 encoded_original.shape, generator=generator, device=device)
+             noised_encoded_original = self.scheduler.add_noise(
+                 encoded_original, noise, t).to(noise_pred.device, noise_pred.dtype)
+             latents[mask] = noised_encoded_original[mask]
+             # </EDIT>
+
+             # call the callback, if provided
+             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                 progress_bar.update()
+                 if callback is not None and i % callback_steps == 0:
+                     callback(i, t, latents)
+
+     # 9. Post-processing
+     image = self.decode_latents(latents)
+
+     # 10. Run safety checker
+     image, has_nsfw_concept = self.run_safety_checker(
+         image, device, prompt_embeds.dtype)
+
+     # 11. Convert to PIL
+     if output_type == "pil":
+         image = self.numpy_to_pil(image)
+
+     if not return_dict:
+         return (image, has_nsfw_concept)
+
+     return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+ # %%
+ image = outpaint(
+     pipe,
+     image=rgb,
+     prompt="forest in the style of Tim Hildebrandt",
+     strength=0.5,
+     num_inference_steps=50,
+     guidance_scale=7.5,
+ ).images[0]
+ image
+
+ # %%
+ # the vae does lossy encoding, we could get better quality if we pasted the original image into our result.
+ # this may yield visible edges
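
The same paste-back idea applies here, with one wrinkle: the img2img preprocessing resizes the input to dimensions divisible by 32 (which is what the mask computation above already assumes), so the output may not exactly match the primed image. A minimal sketch, not part of the committed script, assuming the `rgba` and `image` variables from above:

# sketch: resize the original to the pipeline's output size, then composite it
# over the result using its (resized) alpha channel as the paste mask
rgba_out = rgba.resize(image.size, Image.LANCZOS)
image.paste(rgba_out.convert('RGB'), mask=rgba_out.getchannel('A'))
image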