skytnt committed on
Commit
45d2b45
1 Parent(s): 010c184

Update pipeline.py

Files changed (1)
  1. pipeline.py +317 -221
pipeline.py CHANGED
@@ -6,11 +6,10 @@ import numpy as np
 import torch
 
 import PIL
-from diffusers.onnx_utils import OnnxRuntimeModel
-from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers import OnnxStableDiffusionPipeline, SchedulerMixin
+from diffusers.onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import logging
+from diffusers.utils import PIL_INTERPOLATION, deprecate, logging
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 
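Since the pipeline now derives from OnnxStableDiffusionPipeline, it can be loaded the same way as other diffusers community pipelines. A minimal loading sketch; the model id, revision, and provider below are illustrative assumptions, not part of this commit:

    from diffusers import DiffusionPipeline

    # "lpw_stable_diffusion_onnx" is the community-pipeline id this file is published under
    pipe = DiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        revision="onnx",
        provider="CPUExecutionProvider",
        custom_pipeline="lpw_stable_diffusion_onnx",
    )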
@@ -38,7 +37,7 @@ re_attention = re.compile(
 
 def parse_prompt_attention(text):
     """
-    Parses a string with attention tokens and returns a list of pairs: text and its assoicated weight.
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
     Accepted tokens are:
       (abc) - increases attention to abc by a multiplier of 1.1
       (abc:3.12) - increases attention to abc by a multiplier of 3.12
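For reference, parse_prompt_attention maps the bracket syntax above onto (text, weight) pairs, roughly as follows (weights rounded; runs of equal weight get merged):

    parse_prompt_attention("a (very beautiful) [dark] (sky:1.5)")
    # -> [['a ', 1.0], ['very beautiful', 1.1], [' ', 1.0],
    #     ['dark', 0.909], [' ', 1.0], ['sky', 1.5]]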
@@ -236,12 +235,12 @@ def get_weighted_text_embeddings(
     r"""
     Prompts can be assigned with local weights using brackets. For example,
     prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
-    and the embedding tokens corresponding to the words get multipled by a constant, 1.1.
+    and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
 
-    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the origional mean.
+    Also, to regularize the embedding, the weighted embedding is scaled to preserve the original mean.
 
     Args:
-        pipe (`DiffusionPipeline`):
+        pipe (`OnnxStableDiffusionPipeline`):
             Pipe to provide access to the tokenizer and the text encoder.
         prompt (`str` or `List[str]`):
             The prompt or prompts to guide the image generation.
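A sketch of calling the helper directly, using the same keyword arguments the new _encode_prompt method passes to it (a loaded pipe is assumed):

    text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
        pipe=pipe,
        prompt="a (very beautiful) masterpiece",
        uncond_prompt="",
        max_embeddings_multiples=3,
    )
    # both returns are np.ndarray batches of text-encoder hidden states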
@@ -365,17 +364,17 @@ def get_weighted_text_embeddings(
 def preprocess_image(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     return 2.0 * image - 1.0
 
 
-def preprocess_mask(mask):
+def preprocess_mask(mask, scale_factor=8):
     mask = mask.convert("L")
     w, h = mask.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL.Image.NEAREST)
     mask = np.array(mask).astype(np.float32) / 255.0
     mask = np.tile(mask, (4, 1, 1))
     mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
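As a shape sanity check: for a 512x512 input, preprocess_image yields a (1, 3, 512, 512) float array in [-1, 1], while preprocess_mask downsamples by the new scale_factor of 8 and tiles the single mask channel across the 4 latent channels, yielding (1, 4, 64, 64):

    import PIL.Image

    img = PIL.Image.new("RGB", (512, 512))
    assert preprocess_image(img).shape == (1, 3, 512, 512)
    assert preprocess_mask(img, scale_factor=8).shape == (1, 4, 64, 64)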
@@ -383,7 +382,7 @@ def preprocess_mask(mask):
     return mask
 
 
-class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
+class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
     weighting in prompt.
@@ -399,12 +398,12 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         text_encoder: OnnxRuntimeModel,
         tokenizer: CLIPTokenizer,
         unet: OnnxRuntimeModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        scheduler: SchedulerMixin,
         safety_checker: OnnxRuntimeModel,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
-        super().__init__()
-        self.register_modules(
+        super().__init__(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
             text_encoder=text_encoder,
@@ -413,14 +412,177 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             scheduler=scheduler,
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
+            requires_safety_checker=requires_safety_checker,
         )
+        self.unet_in_channels = 4
+        self.vae_scale_factor = 8
+
+    def _encode_prompt(
+        self,
+        prompt,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt,
+        max_embeddings_multiples,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+        elif isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt] * batch_size
+        if batch_size != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
+            )
+
+        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
+            pipe=self,
+            prompt=prompt,
+            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+            max_embeddings_multiples=max_embeddings_multiples,
+        )
+
+        text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
+        if do_classifier_free_guidance:
+            uncond_embeddings = uncond_embeddings.repeat(num_images_per_prompt, 0)
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def check_inputs(self, prompt, height, width, strength, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def get_timesteps(self, num_inference_steps, strength, is_text2img):
+        if is_text2img:
+            return self.scheduler.timesteps, num_inference_steps
+        else:
+            # get the original timestep using init_timestep
+            offset = self.scheduler.config.get("steps_offset", 0)
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)
+
+            t_start = max(num_inference_steps - init_timestep + offset, 0)
+            timesteps = self.scheduler.timesteps[t_start:]
+            return timesteps, num_inference_steps - t_start
+
+    def run_safety_checker(self, image):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # the safety_checker raises an error if used directly with a batch size > 1,
+            # so run it one image at a time
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems like the half-precision vae decoder produces a strange result if batch size > 1,
+        # so decode the latents one sample at a time
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def prepare_latents(self, image, timestep, batch_size, height, width, dtype, generator, latents=None):
+        if image is None:
+            shape = (
+                batch_size,
+                self.unet_in_channels,
+                height // self.vae_scale_factor,
+                width // self.vae_scale_factor,
+            )
+
+            if latents is None:
+                latents = torch.randn(shape, generator=generator, device="cpu").numpy().astype(dtype)
+            else:
+                if latents.shape != shape:
+                    raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = (torch.from_numpy(latents) * self.scheduler.init_noise_sigma).numpy()
+            return latents, None, None
+        else:
+            init_latents = self.vae_encoder(sample=image)[0]
+            init_latents = 0.18215 * init_latents
+            init_latents = np.concatenate([init_latents] * batch_size, axis=0)
+            init_latents_orig = init_latents
+            shape = init_latents.shape
+
+            # add noise to latents using the timesteps
+            noise = torch.randn(shape, generator=generator, device="cpu").numpy().astype(dtype)
+            latents = self.scheduler.add_noise(
+                torch.from_numpy(init_latents), torch.from_numpy(noise), timestep
+            ).numpy()
+            return latents, init_latents_orig, noise
 
     @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        init_image: Union[np.ndarray, PIL.Image.Image] = None,
+        image: Union[np.ndarray, PIL.Image.Image] = None,
         mask_image: Union[np.ndarray, PIL.Image.Image] = None,
         height: int = 512,
         width: int = 512,
@@ -429,12 +591,13 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         strength: float = 0.8,
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[torch.Generator] = None,
         latents: Optional[np.ndarray] = None,
         max_embeddings_multiples: Optional[int] = 3,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        is_cancelled_callback: Optional[Callable[[], bool]] = None,
         callback_steps: Optional[int] = 1,
         **kwargs,
     ):
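Note the generator swap above: seeding now goes through torch instead of numpy. A before/after sketch for callers:

    import numpy as np
    import torch

    generator = np.random.RandomState(0)          # before this commit
    generator = torch.Generator().manual_seed(0)  # after this commit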
@@ -447,11 +610,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
-            init_image (`np.ndarray` or `PIL.Image.Image`):
+            image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             mask_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -469,18 +632,19 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`np.random.RandomState`, *optional*):
-                A np.random.RandomState to make generation deterministic.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
             latents (`np.ndarray`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
@@ -496,213 +660,142 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
                 called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            is_cancelled_callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. If the function returns
+                `True`, the inference will be cancelled.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
 
         Returns:
+            `None` if cancelled by `is_cancelled_callback`,
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
             When returning a tuple, the first element is a list with the generated images, and the second element is a
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
 
-        if isinstance(prompt, str):
-            batch_size = 1
-            prompt = [prompt]
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-        # get prompt text embeddings
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, strength, callback_steps)
 
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if negative_prompt is None:
-            negative_prompt = [""] * batch_size
-        elif isinstance(negative_prompt, str):
-            negative_prompt = [negative_prompt] * batch_size
-        if batch_size != len(negative_prompt):
-            raise ValueError(
-                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                " the batch size of `prompt`."
-            )
 
-        if generator is None:
-            generator = np.random
-
-        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
-            pipe=self,
-            prompt=prompt,
-            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
-            max_embeddings_multiples=max_embeddings_multiples,
-            **kwargs,
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            max_embeddings_multiples,
         )
-
-        text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
-        if do_classifier_free_guidance:
-            uncond_embeddings = uncond_embeddings.repeat(num_images_per_prompt, 0)
-            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
-
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        latents_dtype = text_embeddings.dtype
-        init_latents_orig = None
-        mask = None
-        noise = None
-
-        if init_image is None:
-            latents_shape = (
-                batch_size * num_images_per_prompt,
-                4,
-                height // 8,
-                width // 8,
-            )
-
-            if latents is None:
-                latents = generator.randn(*latents_shape).astype(latents_dtype)
-            elif latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-
-            timesteps = self.scheduler.timesteps.to(self.device)
-
-            # scale the initial noise by the standard deviation required by the scheduler
-            latents = latents * self.scheduler.init_noise_sigma
+        dtype = text_embeddings.dtype
+
+        # 4. Preprocess image and mask
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess_image(image)
+        if image is not None:
+            image = image.astype(dtype)
+        if isinstance(mask_image, PIL.Image.Image):
+            mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
+        if mask_image is not None:
+            mask = mask_image.astype(dtype)
+            mask = np.concatenate([mask] * batch_size * num_images_per_prompt)
         else:
-            if isinstance(init_image, PIL.Image.Image):
-                init_image = preprocess_image(init_image)
-            # encode the init image into latents and scale the latents
-            init_image = init_image.astype(latents_dtype)
-            init_latents = self.vae_encoder(sample=init_image)[0]
-            init_latents = 0.18215 * init_latents
-            init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
-            init_latents_orig = init_latents
-
-            # preprocess mask
-            if mask_image is not None:
-                if isinstance(mask_image, PIL.Image.Image):
-                    mask_image = preprocess_mask(mask_image)
-                mask_image = mask_image.astype(latents_dtype)
-                mask = np.concatenate([mask_image] * batch_size * num_images_per_prompt)
-
-                # check sizes
-                if not mask.shape == init_latents.shape:
-                    print(mask.shape, init_latents.shape)
-                    raise ValueError("The mask and init_image should be the same size!")
-
-            # get the original timestep using init_timestep
-            offset = self.scheduler.config.get("steps_offset", 0)
-            init_timestep = int(num_inference_steps * strength) + offset
-            init_timestep = min(init_timestep, num_inference_steps)
-
-            timesteps = self.scheduler.timesteps[-init_timestep]
-            timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt)
-
-            # add noise to latents using the timesteps
-            noise = generator.randn(*init_latents.shape).astype(latents_dtype)
-            latents = self.scheduler.add_noise(
-                torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps
-            ).numpy()
-
-            t_start = max(num_inference_steps - init_timestep + offset, 0)
-            timesteps = self.scheduler.timesteps[t_start:]
+            mask = None
 
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        for i, t in enumerate(self.progress_bar(timesteps)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            # predict the noise residual
-            noise_pred = self.unet(
-                sample=latent_model_input,
-                timestep=np.array([t]),
-                encoder_hidden_states=text_embeddings,
-            )
-            noise_pred = noise_pred[0]
-
-            # perform guidance
-            if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample.numpy()
-
-            if mask is not None:
-                # masking
-                init_latents_proper = self.scheduler.add_noise(
-                    torch.from_numpy(init_latents_orig),
-                    torch.from_numpy(noise),
-                    torch.tensor([t]),
-                ).numpy()
-                latents = (init_latents_proper * mask) + (latents * (1 - mask))
-
-            # call the callback, if provided
-            if callback is not None and i % callback_steps == 0:
-                callback(i, t, latents)
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, image is None)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents, init_latents_orig, noise = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            dtype,
+            generator,
+            latents,
+        )
 
-        latents = 1 / 0.18215 * latents
-        # image = self.vae_decoder(latent_sample=latents)[0]
-        # it seems likes there is a problem for using half-precision vae decoder if batchsize>1
-        image = []
-        for i in range(latents.shape[0]):
-            image.append(self.vae_decoder(latent_sample=latents[i : i + 1])[0])
-        image = np.concatenate(image)
-
-        image = np.clip(image / 2 + 0.5, 0, 1)
-        image = image.transpose((0, 2, 3, 1))
-
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(
-                self.numpy_to_pil(image), return_tensors="np"
-            ).pixel_values.astype(image.dtype)
-            # There will throw an error if use safety_checker directly and batchsize>1
-            images, has_nsfw_concept = [], []
-            for i in range(image.shape[0]):
-                image_i, has_nsfw_concept_i = self.safety_checker(
-                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
-                )
-                images.append(image_i)
-                has_nsfw_concept.append(has_nsfw_concept_i[0])
-            image = np.concatenate(images)
-        else:
-            has_nsfw_concept = None
-
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+                latent_model_input = latent_model_input.numpy()
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    sample=latent_model_input,
+                    timestep=np.array([t], dtype=timestep_dtype),
+                    encoder_hidden_states=text_embeddings,
+                )
+                noise_pred = noise_pred[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                scheduler_output = self.scheduler.step(
+                    torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+                )
+                latents = scheduler_output.prev_sample.numpy()
+
+                if mask is not None:
+                    # masking
+                    init_latents_proper = self.scheduler.add_noise(
+                        torch.from_numpy(init_latents_orig),
+                        torch.from_numpy(noise),
+                        t,
+                    ).numpy()
+                    latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if i % callback_steps == 0:
+                        if callback is not None:
+                            callback(i, t, latents)
+                        if is_cancelled_callback is not None and is_cancelled_callback():
+                            return None
+
+        # 9. Post-processing
+        image = self.decode_latents(latents)
+
+        # 10. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image)
+
+        # 11. Convert to PIL
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
         if not return_dict:
-            return (image, has_nsfw_concept)
+            return image, has_nsfw_concept
 
         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
 
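The new is_cancelled_callback lets a caller abort generation between denoising steps, in which case __call__ returns None. A usage sketch, assuming pipe is a loaded instance of this pipeline:

    cancelled = False  # e.g. flipped from a UI thread

    result = pipe(
        prompt="a (very beautiful) landscape",
        is_cancelled_callback=lambda: cancelled,
        callback_steps=1,
    )
    if result is None:
        print("generation was cancelled")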
@@ -716,7 +809,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         guidance_scale: float = 7.5,
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[torch.Generator] = None,
         latents: Optional[np.ndarray] = None,
         max_embeddings_multiples: Optional[int] = 3,
         output_type: Optional[str] = "pil",
@@ -751,8 +844,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`np.random.RandomState`, *optional*):
-                A np.random.RandomState to make generation deterministic.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
             latents (`np.ndarray`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
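A minimal call of this text-to-image entry point (named text2img in the upstream community pipeline) with a weighted prompt and the new torch generator; pipe is assumed loaded:

    import torch

    image = pipe.text2img(
        "an astronaut (riding a horse:1.3)",
        negative_prompt="(low quality:1.4)",
        num_inference_steps=25,
        generator=torch.Generator().manual_seed(0),
    ).images[0]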
@@ -799,7 +893,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
 
     def img2img(
         self,
-        init_image: Union[np.ndarray, PIL.Image.Image],
+        image: Union[np.ndarray, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         strength: float = 0.8,
@@ -807,7 +901,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         guidance_scale: Optional[float] = 7.5,
         num_images_per_prompt: Optional[int] = 1,
         eta: Optional[float] = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[torch.Generator] = None,
         max_embeddings_multiples: Optional[int] = 3,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
@@ -818,7 +912,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for image-to-image generation.
         Args:
-            init_image (`np.ndarray` or `PIL.Image.Image`):
+            image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or ndarray representing an image batch, that will be used as the starting point for the
                 process.
             prompt (`str` or `List[str]`):
@@ -827,11 +921,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -846,8 +940,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`np.random.RandomState`, *optional*):
-                A np.random.RandomState to make generation deterministic.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
             max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                 The max multiple length of prompt embeddings compared to the max output length of text encoder.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -872,7 +967,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
             strength=strength,
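For img2img, the new get_timesteps helper keeps roughly the last int(num_inference_steps * strength) scheduler steps (plus any steps_offset), e.g. about 40 of 50 steps at strength=0.8, so the init image only skips the earliest, noisiest steps. A usage sketch with placeholder file names:

    import torch
    from PIL import Image

    init = Image.open("sketch.png").convert("RGB").resize((512, 512))
    image = pipe.img2img(
        image=init,
        prompt="a fantasy landscape, (detailed:1.2)",
        strength=0.8,
        num_inference_steps=50,
        generator=torch.Generator().manual_seed(0),
    ).images[0]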
@@ -889,7 +984,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
 
     def inpaint(
         self,
-        init_image: Union[np.ndarray, PIL.Image.Image],
+        image: Union[np.ndarray, PIL.Image.Image],
         mask_image: Union[np.ndarray, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -898,7 +993,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         guidance_scale: Optional[float] = 7.5,
         num_images_per_prompt: Optional[int] = 1,
         eta: Optional[float] = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[torch.Generator] = None,
         max_embeddings_multiples: Optional[int] = 3,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
@@ -909,11 +1004,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for inpaint.
         Args:
-            init_image (`np.ndarray` or `PIL.Image.Image`):
+            image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
             mask_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -925,7 +1020,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             strength (`float`, *optional*, defaults to 0.8):
                 Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                 is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
                 noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -941,8 +1036,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`np.random.RandomState`, *optional*):
-                A np.random.RandomState to make generation deterministic.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
             max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                 The max multiple length of prompt embeddings compared to the max output length of text encoder.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -967,7 +1063,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
             mask_image=mask_image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
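An inpaint sketch along the same lines (file names are placeholders); white mask pixels are regenerated, black pixels are preserved:

    import torch
    from PIL import Image

    init = Image.open("photo.png").convert("RGB").resize((512, 512))
    mask = Image.open("mask.png").resize((512, 512))  # white = repaint
    image = pipe.inpaint(
        image=init,
        mask_image=mask,
        prompt="a bench in a park",
        strength=0.75,
        generator=torch.Generator().manual_seed(0),
    ).images[0]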