skytnt committed
Commit 5c94345
1 Parent(s): d013b15

Update pipeline.py

Files changed (1)
  1. pipeline.py +60 -25
pipeline.py CHANGED
@@ -1,18 +1,20 @@
 import inspect
 import re
-import PIL
+from typing import Callable, List, Optional, Union
+
 import numpy as np
 import torch
-from typing import Callable, List, Optional, Union
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
+import PIL
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, logging
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -130,6 +132,7 @@ def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_len
     """
     tokens = []
     weights = []
+    truncated = False
     for text in prompt:
         texts_and_weights = parse_prompt_attention(text)
         text_token = []
@@ -138,21 +141,21 @@ def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_len
             # tokenize and discard the starting and the ending token
             token = pipe.tokenizer(word).input_ids[1:-1]
             text_token += token
-
             # copy the weight by length of token
             text_weight += [weight] * len(token)
-
             # stop if the text is too long (longer than truncation limit)
             if len(text_token) > max_length:
+                truncated = True
                 break
-
         # truncate
         if len(text_token) > max_length:
+            truncated = True
            text_token = text_token[:max_length]
            text_weight = text_weight[:max_length]
-
         tokens.append(text_token)
         weights.append(text_weight)
+    if truncated:
+        logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
     return tokens, weights
 
 
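A note on the new `truncated` flag: weights are copied once per subword token, so a single weighted phrase can occupy many token slots, and before this change any tokens past `max_length` were dropped silently. A minimal sketch of the behavior, using a hypothetical `fake_tokenize` in place of `pipe.tokenizer(word).input_ids[1:-1]`:

import logging

logger = logging.getLogger(__name__)

def fake_tokenize(word):
    # hypothetical stand-in for pipe.tokenizer(word).input_ids[1:-1]:
    # pretend every 4 characters form one subword token
    return [hash(word[i:i + 4]) % 1000 for i in range(0, len(word), 4)]

def prompt_to_tokens_and_weights(texts_and_weights, max_length):
    text_token, text_weight = [], []
    truncated = False
    for word, weight in texts_and_weights:
        token = fake_tokenize(word)
        text_token += token
        text_weight += [weight] * len(token)  # one weight per token, not per word
        if len(text_token) > max_length:
            truncated = True
            break
    if len(text_token) > max_length:
        truncated = True
        text_token = text_token[:max_length]
        text_weight = text_weight[:max_length]
    if truncated:
        logger.warning("Prompt was truncated.")  # surfaces what used to be silent
    return text_token, text_weight

# pairs as parse_prompt_attention would produce for "a photo of a (castle:1.1)"
tokens, weights = prompt_to_tokens_and_weights([("a photo of a ", 1.0), ("castle", 1.1)], 75)
assert len(tokens) == len(weights)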
 
@@ -171,9 +174,9 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_midd
             if len(weights[i]) == 0:
                 w = [1.0] * weights_length
             else:
-                for j in range((len(weights[i]) - 1) // chunk_length + 1):
+                for j in range(max_embeddings_multiples):
                     w.append(1.0)  # weight for starting token in this chunk
-                    w += weights[i][j * chunk_length : min(len(weights[i]), (j + 1) * chunk_length)]
+                    w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
                     w.append(1.0)  # weight for ending token in this chunk
                 w += [1.0] * (weights_length - len(w))
             weights[i] = w[:]
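The slicing fix above is the substantive change in this hunk: every chunk fed to the text encoder carries its own BOS and EOS, so a 77-slot chunk holds only 75 content tokens. Slicing weights by a full `chunk_length` put 77 weights into the first chunk and shifted every later weight away from its token (and the old loop bound could also undercount chunks). A quick check of the corrected arithmetic, assuming CLIP's `chunk_length = 77`:

chunk_length = 77
weights = [1.1] * 150  # two full chunks' worth of weighted content tokens
max_embeddings_multiples = 2

w = []
for j in range(max_embeddings_multiples):
    w.append(1.0)  # BOS slot of chunk j
    w += weights[j * (chunk_length - 2) : min(len(weights), (j + 1) * (chunk_length - 2))]
    w.append(1.0)  # EOS slot of chunk j

# chunk j now carries exactly content tokens 75*j .. 75*j+74, aligned with
# how the token ids themselves are chunked before encoding
assert len(w) == max_embeddings_multiples * chunk_length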
@@ -182,7 +185,10 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_midd
 
 
 def get_unweighted_text_embeddings(
-    pipe: DiffusionPipeline, text_input: torch.Tensor, chunk_length: int, no_boseos_middle: Optional[bool] = True
+    pipe: DiffusionPipeline,
+    text_input: torch.Tensor,
+    chunk_length: int,
+    no_boseos_middle: Optional[bool] = True,
 ):
     """
     When the length of tokens is a multiple of the capacity of the text encoder,
@@ -283,7 +289,8 @@ def get_weighted_text_embeddings(
         max_length = max(max_length, max([len(token) for token in uncond_tokens]))
 
     max_embeddings_multiples = min(
-        max_embeddings_multiples, (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1
+        max_embeddings_multiples,
+        (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
     )
     max_embeddings_multiples = max(1, max_embeddings_multiples)
     max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
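The reflowed `min(...)` leaves the arithmetic unchanged, and it is worth spelling out: with `model_max_length = 77`, each chunk contributes `77 - 2 = 75` content tokens, so `(max_length - 1) // 75 + 1` is ceiling division, i.e. the number of chunks the longest prompt needs, capped by the caller's `max_embeddings_multiples`. The last line then re-adds a single leading BOS and trailing EOS slot. For example:

model_max_length = 77  # CLIP window: BOS + 75 content tokens + EOS

def chunks_needed(n_content_tokens):
    # ceiling division by the per-chunk content capacity
    return (n_content_tokens - 1) // (model_max_length - 2) + 1

assert chunks_needed(75) == 1   # fits in one window
assert chunks_needed(76) == 2   # one token over forces a second chunk
assert chunks_needed(150) == 2

for n in (75, 76, 150):
    m = chunks_needed(n)
    print(n, "content tokens ->", (model_max_length - 2) * m + 2, "padded slots")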
@@ -315,12 +322,18 @@
 
     # get the embeddings
     text_embeddings = get_unweighted_text_embeddings(
-        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+        pipe,
+        prompt_tokens,
+        pipe.tokenizer.model_max_length,
+        no_boseos_middle=no_boseos_middle,
     )
     prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
     if uncond_prompt is not None:
         uncond_embeddings = get_unweighted_text_embeddings(
-            pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+            pipe,
+            uncond_tokens,
+            pipe.tokenizer.model_max_length,
+            no_boseos_middle=no_boseos_middle,
         )
         uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
 
@@ -630,16 +643,29 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (
+            batch_size * num_images_per_prompt,
+            self.unet.in_channels,
+            height // 8,
+            width // 8,
+        )
 
         if latents is None:
             if self.device.type == "mps":
                 # randn does not exist on mps
-                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                    self.device
-                )
+                latents = torch.randn(
+                    latents_shape,
+                    generator=generator,
+                    device="cpu",
+                    dtype=latents_dtype,
+                ).to(self.device)
             else:
-                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+                latents = torch.randn(
+                    latents_shape,
+                    generator=generator,
+                    device=self.device,
+                    dtype=latents_dtype,
+                )
         else:
             if latents.shape != latents_shape:
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
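Context for the `mps` branch: when this was written, `torch.randn` could not take a `generator` on Apple's `mps` backend, so the pipeline samples on CPU and transfers, trading a copy for seed reproducibility. A self-contained sketch of the pattern (the helper name is hypothetical):

import torch

def sample_latents(shape, device, seed, dtype=torch.float32):
    # hypothetical helper mirroring the pipeline's branching: mps cannot
    # sample with a generator, so draw on CPU and move the result over
    if device.type == "mps":
        generator = torch.Generator(device="cpu").manual_seed(seed)
        return torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
    generator = torch.Generator(device=device).manual_seed(seed)
    return torch.randn(shape, generator=generator, device=device, dtype=dtype)

# latents shape matches the hunk above: (batch, unet channels, h/8, w/8)
latents = sample_latents((1, 4, 64, 64), torch.device("cpu"), seed=0)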
@@ -682,11 +708,19 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             # add noise to latents using the timesteps
             if self.device.type == "mps":
                 # randn does not exist on mps
-                noise = torch.randn(init_latents.shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                    self.device
-                )
+                noise = torch.randn(
+                    init_latents.shape,
+                    generator=generator,
+                    device="cpu",
+                    dtype=latents_dtype,
+                ).to(self.device)
             else:
-                noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
+                noise = torch.randn(
+                    init_latents.shape,
+                    generator=generator,
+                    device=self.device,
+                    dtype=latents_dtype,
+                )
             latents = self.scheduler.add_noise(init_latents, noise, timesteps)
 
             t_start = max(num_inference_steps - init_timestep + offset, 0)
@@ -739,7 +773,8 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 self.device
             )
             image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype),
             )
         else:
             has_nsfw_concept = None
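For anyone exercising these changes end to end: this file is the long-prompt-weighting pipeline, normally loaded through diffusers' `custom_pipeline` mechanism. A usage sketch under that assumption (the model id, the `lpw_stable_diffusion` alias, and the parameter values are illustrative, not part of this commit):

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",          # illustrative SD v1 checkpoint
    custom_pipeline="lpw_stable_diffusion",    # community alias for this pipeline
    torch_dtype=torch.float16,
).to("cuda")

# (word:1.2) upweights, [word] downweights; prompts beyond 77 tokens are split
# into up to max_embeddings_multiples chunks instead of being silently cut
image = pipe(
    prompt="a (masterpiece:1.2) photo of a castle, " + "highly detailed, " * 20,
    negative_prompt="lowres, blurry",
    max_embeddings_multiples=3,
    num_inference_steps=30,
).images[0]
image.save("castle.png")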
 