BlockDetail committed on
Commit
2ad3800
1 Parent(s): 0017a3b
Files changed (3)
  1. app.py +1 -1
  2. extension.py +859 -0
  3. requirements.txt +2 -27
app.py CHANGED
@@ -4,7 +4,7 @@ import torch
  import numpy as np
  import cv2
  from PIL import Image, ImageFilter
- from interface.extension import CustomStableDiffusionControlNetPipeline
+ from extension import CustomStableDiffusionControlNetPipeline
 
  negative_prompt = ""
  device = torch.device('cuda')
extension.py ADDED
@@ -0,0 +1,859 @@
1
+ from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
2
+ import torch
3
+ import numpy as np
4
+ from typing import Any, Callable, Dict, List, Optional, Union
5
+ import sys
6
+ import PIL
7
+ import os
8
+ import inspect
9
+ import diffusers
10
+ path = inspect.getfile(diffusers)
11
+ print(path)
12
+ sys.path.append(os.path.join(path, "pipelines/controlnet"))
13
+ sys.path.append(path)
14
+ from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
15
+ from diffusers.utils import replace_example_docstring
16
+ from diffusers.image_processor import PipelineImageInput
17
+ from diffusers.utils.torch_utils import is_compiled_module
18
+ from diffusers.loaders import TextualInversionLoaderMixin, LoraLoaderMixin
19
+ from diffusers.utils.peft_utils import scale_lora_layers, unscale_lora_layers
20
+ from diffusers.utils.import_utils import is_torch_version
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
21
+ from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
22
+ from diffusers.utils import PIL_INTERPOLATION
23
+ from diffusers.utils import logging
+ logger = logging.get_logger(__name__)
+ EXAMPLE_DOC_STRING = ""
24
+ USE_PEFT_BACKEND = False
25
+
26
+ class CustomStableDiffusionControlNetPipeline(StableDiffusionControlNetPipeline):
27
+ def encode_prompt(
28
+ self,
29
+ prompt,
30
+ device,
31
+ num_images_per_prompt,
32
+ do_classifier_free_guidance,
33
+ negative_prompt=None,
34
+ prompt_embeds: Optional[torch.FloatTensor] = None,
35
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
36
+ lora_scale: Optional[float] = None,
37
+ clip_skip: Optional[int] = None,
38
+ ):
39
+ r"""
40
+ Encodes the prompt into text encoder hidden states.
41
+
42
+ Args:
43
+ prompt (`str` or `List[str]`, *optional*):
44
+ prompt to be encoded
45
+ device: (`torch.device`):
46
+ torch device
47
+ num_images_per_prompt (`int`):
48
+ number of images that should be generated per prompt
49
+ do_classifier_free_guidance (`bool`):
50
+ whether to use classifier free guidance or not
51
+ negative_prompt (`str` or `List[str]`, *optional*):
52
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
53
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
54
+ less than `1`).
55
+ prompt_embeds (`torch.FloatTensor`, *optional*):
56
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
57
+ provided, text embeddings will be generated from `prompt` input argument.
58
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
59
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
60
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
61
+ argument.
62
+ lora_scale (`float`, *optional*):
63
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
64
+ clip_skip (`int`, *optional*):
65
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
66
+ the output of the pre-final layer will be used for computing the prompt embeddings.
67
+ """
68
+ # set lora scale so that monkey patched LoRA
69
+ # function of text encoder can correctly access it
70
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
71
+ self._lora_scale = lora_scale
72
+
73
+ # dynamically adjust the LoRA scale
74
+ if not USE_PEFT_BACKEND:
75
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
76
+ else:
77
+ scale_lora_layers(self.text_encoder, lora_scale)
78
+
79
+ if prompt is not None and isinstance(prompt, str):
80
+ batch_size = 1
81
+ elif prompt is not None and isinstance(prompt, list):
82
+ batch_size = len(prompt)
83
+ else:
84
+ batch_size = prompt_embeds.shape[0]
85
+
86
+ if prompt_embeds is None:
87
+ # textual inversion: process multi-vector tokens if necessary
88
+ if isinstance(self, TextualInversionLoaderMixin):
89
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
90
+
91
+ text_inputs = self.tokenizer(
92
+ prompt,
93
+ padding="max_length",
94
+ max_length=self.tokenizer.model_max_length,
95
+ truncation=True,
96
+ return_tensors="pt",
97
+ )
98
+ text_input_ids = text_inputs.input_ids
99
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
100
+
101
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
102
+ text_input_ids, untruncated_ids
103
+ ):
104
+ removed_text = self.tokenizer.batch_decode(
105
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
106
+ )
107
+ logger.warning(
108
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
109
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
110
+ )
111
+
112
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
113
+ attention_mask = text_inputs.attention_mask.to(device)
114
+ else:
115
+ attention_mask = None
116
+
117
+ if clip_skip is None:
118
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
119
+ prompt_embeds = prompt_embeds[0]
120
+ else:
121
+ prompt_embeds = self.text_encoder(
122
+ text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
123
+ )
124
+ # Access the `hidden_states` first, that contains a tuple of
125
+ # all the hidden states from the encoder layers. Then index into
126
+ # the tuple to access the hidden states from the desired layer.
127
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
128
+ # We also need to apply the final LayerNorm here to not mess with the
129
+ # representations. The `last_hidden_states` that we typically use for
130
+ # obtaining the final prompt representations passes through the LayerNorm
131
+ # layer.
132
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
133
+
134
+ if self.text_encoder is not None:
135
+ prompt_embeds_dtype = self.text_encoder.dtype
136
+ elif self.unet is not None:
137
+ prompt_embeds_dtype = self.unet.dtype
138
+ else:
139
+ prompt_embeds_dtype = prompt_embeds.dtype
140
+
141
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
142
+
143
+ bs_embed, seq_len, _ = prompt_embeds.shape
144
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
145
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
146
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
147
+
148
+ # get unconditional embeddings for classifier free guidance
149
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
150
+ uncond_tokens: List[str]
151
+ if negative_prompt is None:
152
+ uncond_tokens = [""] * batch_size
153
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
154
+ raise TypeError(
155
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
156
+ f" {type(prompt)}."
157
+ )
158
+ elif isinstance(negative_prompt, str):
159
+ uncond_tokens = [negative_prompt]
160
+ elif batch_size != len(negative_prompt):
161
+ raise ValueError(
162
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
163
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
164
+ " the batch size of `prompt`."
165
+ )
166
+ else:
167
+ uncond_tokens = negative_prompt
168
+
169
+ # textual inversion: process multi-vector tokens if necessary
170
+ if isinstance(self, TextualInversionLoaderMixin):
171
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
172
+
173
+ max_length = prompt_embeds.shape[1]
174
+ uncond_input = self.tokenizer(
175
+ uncond_tokens,
176
+ padding="max_length",
177
+ max_length=max_length,
178
+ truncation=True,
179
+ return_tensors="pt",
180
+ )
181
+
182
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
183
+ attention_mask = uncond_input.attention_mask.to(device)
184
+ else:
185
+ attention_mask = None
186
+
187
+ negative_prompt_embeds = self.text_encoder(
188
+ uncond_input.input_ids.to(device),
189
+ attention_mask=attention_mask,
190
+ )
191
+ negative_prompt_embeds = negative_prompt_embeds[0]
192
+
193
+ if do_classifier_free_guidance:
194
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
195
+ seq_len = negative_prompt_embeds.shape[1]
196
+
197
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
198
+
199
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
200
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
201
+
202
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
203
+ # Retrieve the original scale by scaling back the LoRA layers
204
+ unscale_lora_layers(self.text_encoder, lora_scale)
205
+ # print(prompt_embeds.shape, negative_prompt_embeds.shape)
206
+ return prompt_embeds, negative_prompt_embeds
207
+
208
+ def prepare_latents_legacy(
209
+ self,
210
+ image,
211
+ timestep,
212
+ batch_size,
213
+ num_images_per_prompt,
214
+ dtype,
215
+ device,
216
+ generator,
217
+ ):
218
+ image = image.to(device=self.device, dtype=dtype)
219
+ init_latent_dist = self.vae.encode(image).latent_dist
220
+ init_latents = init_latent_dist.sample(generator=generator)
221
+ init_latents = 0.18215 * init_latents
222
+
223
+ # Expand init_latents for batch_size and num_images_per_prompt
224
+ init_latents = torch.cat(
225
+ [init_latents] * batch_size * num_images_per_prompt, dim=0
226
+ )
227
+ init_latents_orig = init_latents
228
+
229
+ # add noise to latents using the timesteps
230
+ noise = torch.randn(
231
+ init_latents.shape, generator=generator, device=self.device, dtype=dtype
232
+ )
233
+ print(init_latents.shape, noise.shape)
234
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
235
+ latents = init_latents
236
+ return latents, init_latents_orig, noise
237
+
238
+ def preprocess_mask(self, mask, scale_factor=8):
239
+ mask = mask.convert("L")
240
+ w, h = mask.size
241
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
242
+ mask = mask.resize(
243
+ (w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]
244
+ )
245
+ # print("in preprocess mask 0", np.unique(mask))
246
+ mask = np.array(mask).astype(np.float32) / 255.0
247
+ mask = np.tile(mask, (4, 1, 1))
248
+ # print("in preprocess mask 1", np.unique(mask))
249
+ mask = mask[None].transpose(0, 1, 2, 3)
250
+ mask = 1 - mask # repaint white, keep black
251
+ # print("in preprocess mask 2", np.unique(mask))
252
+ mask = torch.from_numpy(mask)
253
+ return mask
254
+
255
+ def preprocess_image(self, image):
256
+ w, h = image.size
257
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
258
+ image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
259
+ image = np.array(image).astype(np.float32) / 255.0
260
+ image = image[None].transpose(0, 3, 1, 2)
261
+ image = torch.from_numpy(image)
262
+ return 2.0 * image - 1.0
263
+
264
+ def get_timesteps(self, num_inference_steps, strength, device):
265
+ # get the original timestep using init_timestep
266
+ offset = self.scheduler.config.get("steps_offset", 0)
267
+ init_timestep = int(num_inference_steps * strength) + offset
268
+ init_timestep = min(init_timestep, num_inference_steps)
269
+
270
+ t_start = max(num_inference_steps - init_timestep + offset, 0)
271
+ timesteps = self.scheduler.timesteps[t_start:]
272
+
273
+ return timesteps, num_inference_steps - t_start
274
+
275
+ @torch.no_grad()
276
+ def collage(
277
+ self,
278
+ prompt: Union[str, List[str]],
279
+ image: Union[torch.FloatTensor, PIL.Image.Image],
280
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image],
281
+ strength: float = 0.8,
282
+ num_inference_steps: Optional[int] = 50,
283
+ guidance_scale: Optional[float] = 7.5,
284
+ negative_prompt: Optional[Union[str, List[str]]] = None,
285
+ num_images_per_prompt: Optional[int] = 1,
286
+ eta: Optional[float] = 0.0,
287
+ generator: Optional[torch.Generator] = None,
288
+ output_type: Optional[str] = "pil",
289
+ return_dict: bool = True,
290
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
291
+ callback_steps: Optional[int] = 1,
292
+ attention_mod: Optional[Callable] = None,
293
+ **kwargs,
294
+ ):
295
+ r"""
296
+ Function invoked when calling the pipeline for generation.
297
298
+
299
+ Args:
300
+ prompt (`str` or `List[str]`):
301
+ The prompt or prompts to guide the image generation.
302
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
303
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
304
+ process. This is the image whose masked region will be inpainted.
305
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
306
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
307
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
308
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
309
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
310
+ strength (`float`, *optional*, defaults to 0.8):
311
+ Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
312
+ is 1, the denoising process will be run on the masked area for the full number of iterations specified
313
+ in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to
314
+ that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
315
+ num_inference_steps (`int`, *optional*, defaults to 50):
316
+ The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
317
+ the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
318
+ guidance_scale (`float`, *optional*, defaults to 7.5):
319
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
320
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
321
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
322
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
323
+ usually at the expense of lower image quality.
324
+ negative_prompt (`str` or `List[str]`, *optional*):
325
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
326
+ if `guidance_scale` is less than `1`).
327
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
328
+ The number of images to generate per prompt.
329
+ eta (`float`, *optional*, defaults to 0.0):
330
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
331
+ [`schedulers.DDIMScheduler`], will be ignored for others.
332
+ generator (`torch.Generator`, *optional*):
333
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
334
+ deterministic.
335
+ output_type (`str`, *optional*, defaults to `"pil"`):
336
+ The output format of the generated image. Choose between
337
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
338
+ return_dict (`bool`, *optional*, defaults to `True`):
339
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
340
+ plain tuple.
341
+ callback (`Callable`, *optional*):
342
+ A function that will be called every `callback_steps` steps during inference. The function will be
343
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
344
+ callback_steps (`int`, *optional*, defaults to 1):
345
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
346
+ called at every step.
347
+
348
+ Returns:
349
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
350
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
351
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
352
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
353
+ (nsfw) content, according to the `safety_checker`.
354
+ """
355
+ # message = "Please use `image` instead of `init_image`."
356
+ # init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
357
+ # image = init_image or image
358
+
359
+ # 1. Check inputs
360
+ # self.check_inputs(prompt, strength, callback_steps)
361
+
362
+ # 2. Define call parameters
363
+ batch_size = 1
364
+ device = self._execution_device
365
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
366
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
367
+ # corresponds to doing no classifier free guidance.
368
+ do_classifier_free_guidance = guidance_scale > 1.0
369
+
370
+ # 3. Encode input prompt
371
+ text_embeddings = self._encode_prompt(
372
+ prompt,
373
+ device,
374
+ num_images_per_prompt,
375
+ do_classifier_free_guidance,
376
+ negative_prompt,
377
+ )
378
+
379
+ # 4. Preprocess image and mask
380
+ if not isinstance(image[0], torch.FloatTensor):
381
+ image = torch.cat([self.preprocess_image(image[i]) for i in range(len(image))], dim=0)
382
+
383
+ if not isinstance(mask_image, torch.FloatTensor):
384
+ mask_image = torch.cat([self.preprocess_mask(mask_image[i], self.vae_scale_factor) for i in range(len(mask_image))], dim=0)
385
+
386
+ # 5. set timesteps
387
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
388
+ timesteps, num_inference_steps = self.get_timesteps(
389
+ num_inference_steps, strength, device
390
+ )
391
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
392
+
393
+ # 6. Prepare latent variables
394
+ # encode the init image into latents and scale the latents
395
+ latents, init_latents_orig, noise = self.prepare_latents_legacy(
396
+ image,
397
+ latent_timestep,
398
+ batch_size,
399
+ num_images_per_prompt,
400
+ text_embeddings.dtype,
401
+ device,
402
+ generator,
403
+ )
404
+
405
+ # 7. Prepare mask latent
406
+ mask = mask_image.to(device=self.device, dtype=latents.dtype)
407
+ mask = torch.cat([mask] * batch_size * num_images_per_prompt)
408
+
409
+ # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
410
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
411
+
412
+ # 9. Denoising loop
413
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
414
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
415
+ for i, t in enumerate(timesteps):
416
+ # expand the latents if we are doing classifier free guidance
417
+ latent_model_input = (
418
+ torch.cat([latents] * 2) if do_classifier_free_guidance else latents
419
+ )
420
+ latent_model_input = self.scheduler.scale_model_input(
421
+ latent_model_input, t
422
+ )
423
+
424
+ if attention_mod is not None:
425
+ sigma = self.scheduler.sigmas[i]
426
+ attention_mod(self.unet, sigma)
427
+
428
+ # predict the noise residual
429
+ noise_pred = self.unet(
430
+ latent_model_input, t, encoder_hidden_states=text_embeddings
431
+ ).sample
432
+
433
+ # perform guidance
434
+ if do_classifier_free_guidance:
435
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
436
+ noise_pred = noise_pred_uncond + guidance_scale * (
437
+ noise_pred_text - noise_pred_uncond
438
+ )
439
+ # compute the previous noisy sample x_t -> x_t-1
440
+ latents = self.scheduler.step(
441
+ noise_pred, t, latents, **extra_step_kwargs
442
+ ).prev_sample
443
+ # masking
444
+
445
+ noise = torch.randn(
446
+ latents.shape,
447
+ generator=generator,
448
+ device=self.device,
449
+ dtype=text_embeddings.dtype,
450
+ )
451
+ init_latents_proper = self.scheduler.add_noise(
452
+ init_latents_orig, noise, torch.tensor([t])
453
+ )
454
+
455
+ mask_t = (mask > (1 - t/1000)).type(mask.dtype) # So when t is high, most of the mask is 1 (fixed), but when t is low, most of the mask is 0 (variable)
456
+ dilate_size = int(4 * t / 1000)
457
+ mask_t_dilated = torch.nn.functional.max_pool2d(mask_t, dilate_size*2 + 1, stride=1, padding=dilate_size)
458
+ latents = (init_latents_proper * mask_t_dilated) + (latents * (1 - mask_t_dilated))
459
+
460
+ # call the callback, if provided
461
+ if i == len(timesteps) - 1 or (
462
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
463
+ ):
464
+ progress_bar.update()
465
+ if callback is not None and i % callback_steps == 0:
466
+ callback(i, t, latents)
467
+
468
+ # 10. Post-processing
469
+ image = self.decode_latents(latents)
470
+
471
+ # 11. Run safety checker
472
+ image, has_nsfw_concept = self.run_safety_checker(
473
+ image, device, text_embeddings.dtype
474
+ )
475
+
476
+ # print(image.shape)
477
+
478
+ # 12. Convert to PIL
479
+ if output_type == "pil":
480
+ image = self.numpy_to_pil(image)
481
+
482
+ if not return_dict:
483
+ return (image, has_nsfw_concept)
484
+
485
+ return StableDiffusionPipelineOutput(
486
+ images=image, nsfw_content_detected=has_nsfw_concept
487
+ )
488
+
489
+ @torch.no_grad()
490
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
491
+ def __call__(
492
+ self,
493
+ prompt: Union[str, List[str]] = None,
494
+ image: PipelineImageInput = None,
495
+ height: Optional[int] = None,
496
+ width: Optional[int] = None,
497
+ num_inference_steps: int = 50,
498
+ guidance_scale: float = 7.5,
499
+ negative_prompt: Optional[Union[str, List[str]]] = None,
500
+ num_images_per_prompt: Optional[int] = 1,
501
+ eta: float = 0.0,
502
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
503
+ latents: Optional[torch.FloatTensor] = None,
504
+ prompt_embeds: Optional[torch.FloatTensor] = None,
505
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
506
+ output_type: Optional[str] = "pil",
507
+ return_dict: bool = True,
508
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
509
+ callback_steps: int = 1,
510
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
511
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
512
+ guess_mode: bool = False,
513
+ control_guidance_start: Union[float, List[float]] = 0.0,
514
+ control_guidance_end: Union[float, List[float]] = 1.0,
515
+ clip_skip: Optional[int] = None,
516
+ key_image = None,
517
+ key_scale = 0.5,
518
+ neg_mask = None,
519
+ neg_prompt = None,
520
+ ):
521
+ r"""
522
+ The call function to the pipeline for generation.
523
+
524
+ Args:
525
+ prompt (`str` or `List[str]`, *optional*):
526
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
527
+ image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
528
+ `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
529
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
530
+ specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
531
+ accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
532
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
533
+ `init`, images must be passed as a list such that each element of the list can be correctly batched for
534
+ input to a single ControlNet.
535
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
536
+ The height in pixels of the generated image.
537
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
538
+ The width in pixels of the generated image.
539
+ num_inference_steps (`int`, *optional*, defaults to 50):
540
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
541
+ expense of slower inference.
542
+ guidance_scale (`float`, *optional*, defaults to 7.5):
543
+ A higher guidance scale value encourages the model to generate images closely linked to the text
544
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
545
+ negative_prompt (`str` or `List[str]`, *optional*):
546
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
547
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
548
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
549
+ The number of images to generate per prompt.
550
+ eta (`float`, *optional*, defaults to 0.0):
551
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
552
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
553
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
554
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
555
+ generation deterministic.
556
+ latents (`torch.FloatTensor`, *optional*):
557
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
558
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
559
+ tensor is generated by sampling using the supplied random `generator`.
560
+ prompt_embeds (`torch.FloatTensor`, *optional*):
561
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
562
+ provided, text embeddings are generated from the `prompt` input argument.
563
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
564
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
565
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
566
+ output_type (`str`, *optional*, defaults to `"pil"`):
567
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
568
+ return_dict (`bool`, *optional*, defaults to `True`):
569
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
570
+ plain tuple.
571
+ callback (`Callable`, *optional*):
572
+ A function that is called every `callback_steps` steps during inference. The function is called with the
573
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
574
+ callback_steps (`int`, *optional*, defaults to 1):
575
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
576
+ every step.
577
+ cross_attention_kwargs (`dict`, *optional*):
578
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
579
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
580
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
581
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
582
+ to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
583
+ the corresponding scale as a list.
584
+ guess_mode (`bool`, *optional*, defaults to `False`):
585
+ The ControlNet encoder tries to recognize the content of the input image even if you remove all
586
+ prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
587
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
588
+ The percentage of total steps at which the ControlNet starts applying.
589
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
590
+ The percentage of total steps at which the ControlNet stops applying.
591
+ clip_skip (`int`, *optional*):
592
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
593
+ the output of the pre-final layer will be used for computing the prompt embeddings.
594
+
595
+ Examples:
596
+
597
+ Returns:
598
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
599
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
600
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
601
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
602
+ "not-safe-for-work" (nsfw) content.
603
+ """
604
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
605
+
606
+ # align format for control guidance
607
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
608
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
609
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
610
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
611
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
612
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
613
+ control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [
614
+ control_guidance_end
615
+ ]
616
+
617
+ # 1. Check inputs. Raise error if not correct
618
+ self.check_inputs(
619
+ prompt,
620
+ image,
621
+ callback_steps,
622
+ negative_prompt,
623
+ prompt_embeds,
624
+ negative_prompt_embeds,
625
+ controlnet_conditioning_scale,
626
+ control_guidance_start,
627
+ control_guidance_end,
628
+ )
629
+
630
+ # 2. Define call parameters
631
+ if prompt is not None and isinstance(prompt, str):
632
+ batch_size = 1
633
+ elif prompt is not None and isinstance(prompt, list):
634
+ batch_size = len(prompt)
635
+ else:
636
+ batch_size = prompt_embeds.shape[0]
637
+
638
+ device = self._execution_device
639
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
640
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
641
+ # corresponds to doing no classifier free guidance.
642
+ do_classifier_free_guidance = guidance_scale > 1.0
643
+
644
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
645
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
646
+
647
+ global_pool_conditions = (
648
+ controlnet.config.global_pool_conditions
649
+ if isinstance(controlnet, ControlNetModel)
650
+ else controlnet.nets[0].config.global_pool_conditions
651
+ )
652
+ guess_mode = guess_mode or global_pool_conditions
653
+
654
+ # 3. Encode input prompt
655
+ text_encoder_lora_scale = (
656
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
657
+ )
658
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
659
+ prompt,
660
+ device,
661
+ num_images_per_prompt,
662
+ do_classifier_free_guidance,
663
+ negative_prompt,
664
+ lora_scale=text_encoder_lora_scale,
665
+ clip_skip=clip_skip,
666
+ )
667
+ # For classifier free guidance, we need to do two forward passes.
668
+ # Here we concatenate the unconditional and text embeddings into a single batch
669
+ # to avoid doing two forward passes
670
+ if do_classifier_free_guidance:
671
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
672
+ if neg_mask is not None:
673
+ empty_prompt_embeds = torch.cat([empty_negative_prompt_embeds, empty_prompt_embeds])
674
+
675
+ # 4. Prepare image
676
+ if isinstance(controlnet, ControlNetModel):
677
+ # breakpoint()
678
+ image = self.prepare_image(
679
+ image=image,
680
+ width=width,
681
+ height=height,
682
+ batch_size=batch_size * num_images_per_prompt,
683
+ num_images_per_prompt=num_images_per_prompt,
684
+ device=device,
685
+ dtype=controlnet.dtype,
686
+ do_classifier_free_guidance=do_classifier_free_guidance,
687
+ guess_mode=guess_mode,
688
+ )
689
+ height, width = image.shape[-2:]
690
+ if key_image is not None:
691
+ key_image = self.prepare_image(
692
+ image=key_image,
693
+ width=width,
694
+ height=height,
695
+ batch_size=batch_size * num_images_per_prompt,
696
+ num_images_per_prompt=num_images_per_prompt,
697
+ device=device,
698
+ dtype=controlnet.dtype,
699
+ do_classifier_free_guidance=do_classifier_free_guidance,
700
+ guess_mode=guess_mode,
701
+ )
702
+ elif isinstance(controlnet, MultiControlNetModel):
703
+ images = []
704
+ key_images = []
705
+
706
+ for image_ in image:
707
+ image_ = self.prepare_image(
708
+ image=image_,
709
+ width=width,
710
+ height=height,
711
+ batch_size=batch_size * num_images_per_prompt,
712
+ num_images_per_prompt=num_images_per_prompt,
713
+ device=device,
714
+ dtype=controlnet.dtype,
715
+ do_classifier_free_guidance=do_classifier_free_guidance,
716
+ guess_mode=guess_mode,
717
+ )
718
+
719
+ images.append(image_)
720
+
721
+ image = images
722
+ height, width = image[0].shape[-2:]
723
+ else:
724
+ assert False
725
+
726
+ # 5. Prepare timesteps
727
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
728
+ timesteps = self.scheduler.timesteps
729
+
730
+ # 6. Prepare latent variables
731
+ num_channels_latents = self.unet.config.in_channels
732
+ latents = self.prepare_latents(
733
+ batch_size * num_images_per_prompt,
734
+ num_channels_latents,
735
+ height,
736
+ width,
737
+ prompt_embeds.dtype,
738
+ device,
739
+ generator,
740
+ latents,
741
+ )
742
+
743
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
744
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
745
+
746
+ # 7.1 Create tensor stating which controlnets to keep
747
+ controlnet_keep = []
748
+ for i in range(len(timesteps)):
749
+ keeps = [
750
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
751
+ for s, e in zip(control_guidance_start, control_guidance_end)
752
+ ]
753
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
754
+
755
+
756
+ # 8. Denoising loop
757
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
758
+ is_unet_compiled = is_compiled_module(self.unet)
759
+ is_controlnet_compiled = is_compiled_module(self.controlnet)
760
+ is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
761
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
762
+ for i, t in enumerate(timesteps):
763
+ # Relevant thread:
764
+ # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
765
+ if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
766
+ torch._inductor.cudagraph_mark_step_begin()
767
+ # expand the latents if we are doing classifier free guidance
768
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
769
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
770
+
771
+ # controlnet(s) inference
772
+ if guess_mode and do_classifier_free_guidance:
773
+ # Infer ControlNet only for the conditional batch.
774
+ control_model_input = latents
775
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
776
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
777
+ else:
778
+ control_model_input = latent_model_input
779
+ controlnet_prompt_embeds = prompt_embeds
780
+
781
+ if isinstance(controlnet_keep[i], list):
782
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
783
+ else:
784
+ controlnet_cond_scale = controlnet_conditioning_scale
785
+ if isinstance(controlnet_cond_scale, list):
786
+ controlnet_cond_scale = controlnet_cond_scale[0]
787
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
788
+
789
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
790
+ control_model_input,
791
+ t,
792
+ encoder_hidden_states=controlnet_prompt_embeds,
793
+ controlnet_cond=image,
794
+ conditioning_scale=cond_scale,
795
+ guess_mode=guess_mode,
796
+ return_dict=False,
797
+ )
798
+
799
+ if guess_mode and do_classifier_free_guidance:
800
+ # Inferred ControlNet only for the conditional batch.
801
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
802
+ # add 0 to the unconditional batch to keep it unchanged.
803
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
804
+ mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
805
+
806
+ # predict the noise residual
807
+ noise_pred = self.unet(
808
+ latent_model_input,
809
+ t,
810
+ encoder_hidden_states=prompt_embeds,
811
+ cross_attention_kwargs=cross_attention_kwargs,
812
+ down_block_additional_residuals=down_block_res_samples,
813
+ mid_block_additional_residual=mid_block_res_sample,
814
+ return_dict=False,
815
+ )[0]
816
+
817
+ # perform guidance
818
+ if do_classifier_free_guidance:
819
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
820
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
821
+
822
+ # compute the previous noisy sample x_t -> x_t-1
823
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
824
+
825
+ # call the callback, if provided
826
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
827
+ progress_bar.update()
828
+ if callback is not None and i % callback_steps == 0:
829
+ step_idx = i // getattr(self.scheduler, "order", 1)
830
+ callback(step_idx, t, latents)
831
+
832
+ # If we do sequential model offloading, let's offload unet and controlnet
833
+ # manually for max memory savings
834
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
835
+ self.unet.to("cpu")
836
+ self.controlnet.to("cpu")
837
+ torch.cuda.empty_cache()
838
+
839
+ if not output_type == "latent":
840
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
841
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
842
+ else:
843
+ image = latents
844
+ has_nsfw_concept = None
845
+
846
+ if has_nsfw_concept is None:
847
+ do_denormalize = [True] * image.shape[0]
848
+ else:
849
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
850
+
851
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
852
+
853
+ # Offload all models
854
+ self.maybe_free_model_hooks()
855
+
856
+ if not return_dict:
857
+ return (image, has_nsfw_concept)
858
+
859
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
requirements.txt CHANGED
@@ -1,33 +1,8 @@
- requests==2.31.0
- setuptools==68.2.2
- yaml==0.2.5
- accelerate==0.24.1
  diffusers==0.23.1
- einops==0.3.0
- flask==3.0.0
- flask-cors==4.0.0
  gradio==3.48.0
- gradio-client==0.6.1
- imageio==2.9.0
- imageio-ffmpeg==0.4.2
- matplotlib==3.7.3
- multiprocess==0.70.15
- omegaconf==2.3.0
- opencv-contrib-python==4.3.0.36
- opencv-python==4.8.1.78
- opencv-python-headless==4.8.1.78
  pillow==9.4.0
- pytorch-lightning==1.5.0
- safetensors==0.4.0
- scikit-image==0.20.0
- scikit-learn==1.3.1
- scipy==1.9.1
- threadpoolctl==3.2.0
- tokenizers==0.14.1
  torch==2.1.1
  torchmetrics==0.6.0
  torchvision==0.16.1
- tqdm==4.66.1
- transformers==4.34.1
- watchdog==3.0.0
- open-clip-torch==2.0.2
+ numpy
+ inspect
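
For reference, a minimal usage sketch of the pipeline class added in this commit. It is illustrative only: the checkpoint ids, the blank scribble image, and the rectangular mask below are placeholder assumptions, not part of the commit.

import torch
from PIL import Image
from diffusers import ControlNetModel
from extension import CustomStableDiffusionControlNetPipeline

# Placeholder checkpoints chosen for this sketch.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16
)
pipe = CustomStableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

# Text-to-image through the overridden __call__, conditioned on a (placeholder) scribble image.
scribble = Image.new("RGB", (512, 512), "white")
out = pipe("a photo of a dog", image=scribble, num_inference_steps=30).images[0]

# Masked img2img refinement through the new `collage` method:
# white pixels in the mask are repainted, black pixels are preserved.
mask = Image.new("L", out.size, 0)
mask.paste(255, (128, 128, 384, 384))
refined = pipe.collage(
    prompt="a photo of a dog",
    image=[out],
    mask_image=[mask],
    strength=0.8,
    num_inference_steps=50,
).images[0]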