OzzyGT (HF staff) committed
Commit 0f7a0a6
Parent: 613da31

Upload pipeline.py

Files changed (1)
  1. pipeline.py +474 -0
pipeline.py ADDED
@@ -0,0 +1,474 @@
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import List, Optional, Union
+
+ import PIL.Image
+ import torch
+ import torch.nn.functional as F
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+ from controlnet_union import ControlNetModel_Union
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+ from diffusers.schedulers import KarrasDiffusionSchedulers
+ from diffusers.utils.torch_utils import randn_tensor
+
+
+ def retrieve_timesteps(
+     scheduler,
+     num_inference_steps: Optional[int] = None,
+     device: Optional[Union[str, torch.device]] = None,
+     **kwargs,
+ ):
+     scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+     timesteps = scheduler.timesteps
+
+     return timesteps, num_inference_steps
+
+
+ class StableDiffusionXLPipeline(DiffusionPipeline, StableDiffusionMixin):
+     model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+     _optional_components = [
+         "tokenizer",
+         "tokenizer_2",
+         "text_encoder",
+         "text_encoder_2",
+     ]
+
+     def __init__(
+         self,
+         vae: AutoencoderKL,
+         text_encoder: CLIPTextModel,
+         text_encoder_2: CLIPTextModelWithProjection,
+         tokenizer: CLIPTokenizer,
+         tokenizer_2: CLIPTokenizer,
+         unet: UNet2DConditionModel,
+         controlnet: ControlNetModel_Union,
+         scheduler: KarrasDiffusionSchedulers,
+         force_zeros_for_empty_prompt: bool = True,
+     ):
+         super().__init__()
+
+         self.register_modules(
+             vae=vae,
+             text_encoder=text_encoder,
+             text_encoder_2=text_encoder_2,
+             tokenizer=tokenizer,
+             tokenizer_2=tokenizer_2,
+             unet=unet,
+             controlnet=controlnet,
+             scheduler=scheduler,
+         )
+         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+         self.control_image_processor = VaeImageProcessor(
+             vae_scale_factor=self.vae_scale_factor,
+             do_convert_rgb=True,
+             do_normalize=False,
+         )
+
+         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+
+     def encode_prompt(
+         self,
+         prompt: str,
+         device: Optional[torch.device] = None,
+         do_classifier_free_guidance: bool = True,
+     ):
+         device = device or self._execution_device
+         prompt = [prompt] if isinstance(prompt, str) else prompt
+
+         if prompt is not None:
+             batch_size = len(prompt)
+
+         # Define tokenizers and text encoders
+         tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+         text_encoders = (
+             [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+         )
+
+         prompt_2 = prompt
+         prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+         # textual inversion: process multi-vector tokens if necessary
+         prompt_embeds_list = []
+         prompts = [prompt, prompt_2]
+         for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+             text_inputs = tokenizer(
+                 prompt,
+                 padding="max_length",
+                 max_length=tokenizer.model_max_length,
+                 truncation=True,
+                 return_tensors="pt",
+             )
+
+             text_input_ids = text_inputs.input_ids
+
+             prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+             # We are only ALWAYS interested in the pooled output of the final text encoder
+             pooled_prompt_embeds = prompt_embeds[0]
+             prompt_embeds = prompt_embeds.hidden_states[-2]
+             prompt_embeds_list.append(prompt_embeds)
+
+         prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+         # get unconditional embeddings for classifier free guidance
+         zero_out_negative_prompt = True
+         negative_prompt_embeds = None
+         negative_pooled_prompt_embeds = None
+
+         if do_classifier_free_guidance and zero_out_negative_prompt:
+             negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+             negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+         elif do_classifier_free_guidance and negative_prompt_embeds is None:
+             negative_prompt = ""
+             negative_prompt_2 = negative_prompt
+
+             # normalize str to list
+             negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+             negative_prompt_2 = (
+                 batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+             )
+
+             uncond_tokens: List[str]
+             if prompt is not None and type(prompt) is not type(negative_prompt):
+                 raise TypeError(
+                     f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
+                     f" {type(prompt)}."
+                 )
+             elif batch_size != len(negative_prompt):
+                 raise ValueError(
+                     f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                     f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                     " the batch size of `prompt`."
+                 )
+             else:
+                 uncond_tokens = [negative_prompt, negative_prompt_2]
+
+             negative_prompt_embeds_list = []
+             for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                 max_length = prompt_embeds.shape[1]
+                 uncond_input = tokenizer(
+                     negative_prompt,
+                     padding="max_length",
+                     max_length=max_length,
+                     truncation=True,
+                     return_tensors="pt",
+                 )
+
+                 negative_prompt_embeds = text_encoder(
+                     uncond_input.input_ids.to(device),
+                     output_hidden_states=True,
+                 )
+                 # We are only ALWAYS interested in the pooled output of the final text encoder
+                 negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                 negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+             negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+         prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+
+         bs_embed, seq_len, _ = prompt_embeds.shape
+         # duplicate text embeddings for each generation per prompt, using mps friendly method
+         prompt_embeds = prompt_embeds.repeat(1, 1, 1)
+         prompt_embeds = prompt_embeds.view(bs_embed * 1, seq_len, -1)
+
+         if do_classifier_free_guidance:
+             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+             seq_len = negative_prompt_embeds.shape[1]
+
+             if self.text_encoder_2 is not None:
+                 negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+             else:
+                 negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+             negative_prompt_embeds = negative_prompt_embeds.repeat(1, 1, 1)
+             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * 1, seq_len, -1)
+
+         pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, 1).view(bs_embed * 1, -1)
+         if do_classifier_free_guidance:
+             negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, 1).view(bs_embed * 1, -1)
+
+         return (
+             prompt_embeds,
+             negative_prompt_embeds,
+             pooled_prompt_embeds,
+             negative_pooled_prompt_embeds,
+         )
+
+     def check_inputs(
+         self,
+         prompt_embeds,
+         negative_prompt_embeds,
+         pooled_prompt_embeds,
+         negative_pooled_prompt_embeds,
+         image,
+         controlnet_conditioning_scale=1.0,
+     ):
+         if prompt_embeds is None:
+             raise ValueError("Provide `prompt_embeds`. Cannot leave `prompt_embeds` undefined.")
+
+         if negative_prompt_embeds is None:
+             raise ValueError("Provide `negative_prompt_embeds`. Cannot leave `negative_prompt_embeds` undefined.")
+
+         if prompt_embeds.shape != negative_prompt_embeds.shape:
+             raise ValueError(
+                 "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                 f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                 f" {negative_prompt_embeds.shape}."
+             )
+
+         if prompt_embeds is not None and pooled_prompt_embeds is None:
+             raise ValueError(
+                 "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+             )
+
+         if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+             raise ValueError(
+                 "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+             )
+
+         # Check `image`
+         is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+             self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+         )
+         if (
+             isinstance(self.controlnet, ControlNetModel_Union)
+             or is_compiled
+             and isinstance(self.controlnet._orig_mod, ControlNetModel_Union)
+         ):
+             if not isinstance(image, PIL.Image.Image):
+                 raise TypeError(f"image must be passed and has to be a PIL image, but is {type(image)}")
+
+         else:
+             assert False
+
+         # Check `controlnet_conditioning_scale`
+         if (
+             isinstance(self.controlnet, ControlNetModel_Union)
+             or is_compiled
+             and isinstance(self.controlnet._orig_mod, ControlNetModel_Union)
+         ):
+             if not isinstance(controlnet_conditioning_scale, float):
+                 raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+         else:
+             assert False
+
+     def prepare_image(self, image, device, dtype, do_classifier_free_guidance=False):
+         image = self.control_image_processor.preprocess(image).to(dtype=torch.float32)
+
+         image_batch_size = image.shape[0]
+
+         image = image.repeat_interleave(image_batch_size, dim=0)
+         image = image.to(device=device, dtype=dtype)
+
+         if do_classifier_free_guidance:
+             image = torch.cat([image] * 2)
+
+         return image
+
+     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device):
+         shape = (
+             batch_size,
+             num_channels_latents,
+             int(height) // self.vae_scale_factor,
+             int(width) // self.vae_scale_factor,
+         )
+
+         latents = randn_tensor(shape, device=device, dtype=dtype)
+
+         # scale the initial noise by the standard deviation required by the scheduler
+         latents = latents * self.scheduler.init_noise_sigma
+         return latents
+
+     @property
+     def guidance_scale(self):
+         return self._guidance_scale
+
+     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+     # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+     # corresponds to doing no classifier free guidance.
+     @property
+     def do_classifier_free_guidance(self):
+         return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+     @property
+     def num_timesteps(self):
+         return self._num_timesteps
+
+     @torch.no_grad()
+     def __call__(
+         self,
+         prompt_embeds: torch.Tensor,
+         negative_prompt_embeds: torch.Tensor,
+         pooled_prompt_embeds: torch.Tensor,
+         negative_pooled_prompt_embeds: torch.Tensor,
+         image: PipelineImageInput = None,
+         num_inference_steps: int = 8,
+         guidance_scale: float = 1.5,
+         controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+     ):
+         # 1. Check inputs. Raise error if not correct
+         self.check_inputs(
+             prompt_embeds,
+             negative_prompt_embeds,
+             pooled_prompt_embeds,
+             negative_pooled_prompt_embeds,
+             image,
+             controlnet_conditioning_scale,
+         )
+
+         self._guidance_scale = guidance_scale
+
+         # 2. Define call parameters
+         batch_size = 1
+         device = self._execution_device
+
+         # 4. Prepare image
+         if isinstance(self.controlnet, ControlNetModel_Union):
+             image = self.prepare_image(
+                 image=image,
+                 device=device,
+                 dtype=self.controlnet.dtype,
+                 do_classifier_free_guidance=self.do_classifier_free_guidance,
+             )
+             height, width = image.shape[-2:]
+         else:
+             assert False
+
+         # 5. Prepare timesteps
+         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device)
+         self._num_timesteps = len(timesteps)
+
+         # 6. Prepare latent variables
+         num_channels_latents = self.unet.config.in_channels
+         latents = self.prepare_latents(
+             batch_size,
+             num_channels_latents,
+             height,
+             width,
+             prompt_embeds.dtype,
+             device,
+         )
+
+         # 7. Prepare added time ids & embeddings
+         add_text_embeds = pooled_prompt_embeds
+
+         # SDXL micro-conditioning: (original size, crop top-left, target size)
+         add_time_ids = negative_add_time_ids = torch.tensor(
+             image.shape[-2:] + torch.Size([0, 0]) + image.shape[-2:]
+         ).unsqueeze(0)
+
+         if self.do_classifier_free_guidance:
+             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+             add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+             add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+         prompt_embeds = prompt_embeds.to(device)
+         add_text_embeds = add_text_embeds.to(device)
+         add_time_ids = add_time_ids.to(device).repeat(batch_size, 1)
+
+         controlnet_image_list = [0, 0, 0, 0, 0, 0, image, 0]
+         union_control_type = (
+             torch.Tensor([0, 0, 0, 0, 0, 0, 1, 0]).to(device, dtype=prompt_embeds.dtype).repeat(batch_size * 2, 1)
+         )
+
+         added_cond_kwargs = {
+             "text_embeds": add_text_embeds,
+             "time_ids": add_time_ids,
+             "control_type": union_control_type,
+         }
+
+         controlnet_prompt_embeds = prompt_embeds
+         controlnet_added_cond_kwargs = added_cond_kwargs
+
+         # 8. Denoising loop
+         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+         with self.progress_bar(total=num_inference_steps) as progress_bar:
+             for i, t in enumerate(timesteps):
+                 # expand the latents if we are doing classifier free guidance
+                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                 # controlnet(s) inference
+                 control_model_input = latent_model_input
+
+                 down_block_res_samples, mid_block_res_sample = self.controlnet(
+                     control_model_input,
+                     t,
+                     encoder_hidden_states=controlnet_prompt_embeds,
+                     controlnet_cond_list=controlnet_image_list,
+                     conditioning_scale=controlnet_conditioning_scale,
+                     guess_mode=False,
+                     added_cond_kwargs=controlnet_added_cond_kwargs,
+                     return_dict=False,
+                 )
+
+                 # predict the noise residual
+                 noise_pred = self.unet(
+                     latent_model_input,
+                     t,
+                     encoder_hidden_states=prompt_embeds,
+                     timestep_cond=None,
+                     cross_attention_kwargs={},
+                     down_block_additional_residuals=down_block_res_samples,
+                     mid_block_additional_residual=mid_block_res_sample,
+                     added_cond_kwargs=added_cond_kwargs,
+                     return_dict=False,
+                 )[0]
+
+                 # perform guidance
+                 if self.do_classifier_free_guidance:
+                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                 # compute the previous noisy sample x_t -> x_t-1
+                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                 # after the third step, drop classifier-free guidance: keep only the
+                 # conditional half of the batched inputs and run the remaining steps unguided
+                 if i == 2:
+                     prompt_embeds = prompt_embeds[-1:]
+                     add_text_embeds = add_text_embeds[-1:]
+                     add_time_ids = add_time_ids[-1:]
+                     union_control_type = union_control_type[-1:]
+
+                     added_cond_kwargs = {
+                         "text_embeds": add_text_embeds,
+                         "time_ids": add_time_ids,
+                         "control_type": union_control_type,
+                     }
+
+                     controlnet_prompt_embeds = prompt_embeds
+                     controlnet_added_cond_kwargs = added_cond_kwargs
+
+                     image = image[-1:]
+                     controlnet_image_list = [0, 0, 0, 0, 0, 0, image, 0]
+
+                     self._guidance_scale = 0.0
+
+                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                     progress_bar.update()
+
+         latents = latents / self.vae.config.scaling_factor
+         image = self.vae.decode(latents, return_dict=False)[0]
+         image = self.image_processor.postprocess(image)[0]
+
+         # Offload all models
+         self.maybe_free_model_hooks()
+
+         return image
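
For reference, a minimal usage sketch of this pipeline (not part of the committed file). The checkpoint IDs, dtype, and the way the outpainting canvas is prepared are assumptions and may differ from how the accompanying Space actually loads the weights; `ControlNetModel_Union` comes from the `controlnet_union.py` module imported above, and `__call__` expects the four embedding tensors produced by `encode_prompt`:

import torch
from PIL import Image

from controlnet_union import ControlNetModel_Union
from pipeline import StableDiffusionXLPipeline

# Assumed checkpoints; substitute the controlnet-union and SDXL weights you actually use.
controlnet = ControlNetModel_Union.from_pretrained(
    "xinsir/controlnet-union-sdxl-1.0", torch_dtype=torch.float16
)
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

# Control image: the source photo pasted onto the larger, otherwise empty canvas to be
# outpainted (preparing that canvas is done outside pipeline.py).
control_image = Image.open("canvas_with_source_image.png")

(
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
) = pipe.encode_prompt("a wide shot of a beach at sunset")

result = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
    image=control_image,
    num_inference_steps=8,
    guidance_scale=1.5,
)
result.save("outpainted.png")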