deinferno committed
Commit 8943a6e
Parent: 2c9efe6

Update scheduler and pipeline code to be closer to diffusers version


Update scheduler config to latest version
Create and use torch.Generator seeded from np.random.RandomState to fix custom seeds
Properly override __init__ with edited OVModelUnet
Use official _encode_prompt
Rename get_w_embedding to get_guidance_scale_embedding and use guidance_scale-1
Update README.md
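The seed fix above works by bridging NumPy and PyTorch RNG state: optimum's OpenVINO pipelines accept an `np.random.RandomState`, while `LCMScheduler.step()` draws noise through a `torch.Generator`. A minimal standalone sketch of the idea (illustration only, not the committed code):

```python
import numpy as np
import torch

# The user-facing seed still arrives as a NumPy RandomState (optimum convention).
rng = np.random.RandomState(42)

# Derive a deterministic torch.Generator from the RandomState's Mersenne Twister
# state; keys[0] is used as a reproducible proxy for the original seed.
torch_generator = torch.Generator().manual_seed(int(rng.get_state()[1][0]))

# Noise drawn this way is reproducible across runs for a fixed seed.
noise = torch.randn((1, 4, 64, 64), generator=torch_generator)
```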

README.md CHANGED
@@ -73,5 +73,5 @@ num_inference_steps = 4
 pipe.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
 pipe.compile()
 
-images = pipe(prompt=prompt, width=width, height=height, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images
+images = pipe(prompt=prompt, width=width, height=height, num_inference_steps=num_inference_steps, guidance_scale=8.0, output_type="pil").images
 ```
lcm_ov_pipeline.py CHANGED
@@ -9,7 +9,15 @@ import openvino
 import torch
 
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipeline, OVModelUnet
+from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipeline, OVModelUnet, OVModelVaeDecoder, OVModelTextEncoder, OVModelVaeEncoder, VaeImageProcessor
+from optimum.utils import (
+    DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
+    DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
+    DIFFUSION_MODEL_UNET_SUBFOLDER,
+    DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
+    DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
+)
+
 
 from diffusers import logging
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -43,6 +51,7 @@ class LCMOVModelUnet(OVModelUnet):
         return list(outputs.values())
 
 class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
+
     def __init__(
         self,
         vae_decoder: openvino.runtime.Model,
@@ -62,9 +71,56 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         **kwargs,
     ):
-        super().__init__(vae_decoder, text_encoder, unet, config, tokenizer, scheduler, feature_extractor, vae_encoder, text_encoder_2, tokenizer_2, device, dynamic_shapes, compile, ov_config, model_save_dir, **kwargs)
-
+        self._internal_dict = config
+        self._device = device.upper()
+        self.is_dynamic = dynamic_shapes
+        self.ov_config = ov_config if ov_config is not None else {}
+        self._model_save_dir = (
+            Path(model_save_dir.name) if isinstance(model_save_dir, TemporaryDirectory) else model_save_dir
+        )
+        self.vae_decoder = OVModelVaeDecoder(vae_decoder, self)
         self.unet = LCMOVModelUnet(unet, self)
+        self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None
+        self.text_encoder_2 = (
+            OVModelTextEncoder(text_encoder_2, self, model_name=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER)
+            if text_encoder_2 is not None
+            else None
+        )
+        self.vae_encoder = OVModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None
+
+        if "block_out_channels" in self.vae_decoder.config:
+            self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1)
+        else:
+            self.vae_scale_factor = 8
+
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.tokenizer = tokenizer
+        self.tokenizer_2 = tokenizer_2
+        self.scheduler = scheduler
+        self.feature_extractor = feature_extractor
+        self.safety_checker = None
+        self.preprocessors = []
+
+        if self.is_dynamic:
+            self.reshape(batch_size=-1, height=-1, width=-1, num_images_per_prompt=-1)
+
+        if compile:
+            self.compile()
+
+        sub_models = {
+            DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder,
+            DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet,
+            DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder,
+            DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder,
+            DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2,
+        }
+        for name in sub_models.keys():
+            self._internal_dict[name] = (
+                ("optimum", sub_models[name].__class__.__name__) if sub_models[name] is not None else (None, None)
+            )
+
+        self._internal_dict.pop("vae", None)
 
     def _reshape_unet(
         self,
@@ -110,63 +166,7 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         model.reshape(shapes)
         return model
 
-
-    def _encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        num_images_per_prompt: Optional[int],
-        prompt_embeds: Optional[np.ndarray] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                prompt to be encoded
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            prompt_embeds (`np.ndarray`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            # get prompt text embeddings
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="np",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
-
-            if not np.array_equal(text_input_ids, untruncated_ids):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-
-        prompt_embeds = np.tile(prompt_embeds, [1, num_images_per_prompt, 1])
-        prompt_embeds = np.reshape(prompt_embeds, [bs_embed * num_images_per_prompt, seq_len, -1])
-
-        return prompt_embeds
-
-    def get_w_embedding(self, w, embedding_dim=512, dtype=np.float32):
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=np.float32):
         """
         see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
         Args:
@@ -197,7 +197,7 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 4,
-        lcm_origin_steps: int = 50,
+        original_inference_steps: int = None,
         guidance_scale: float = 7.5,
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
@@ -224,8 +224,11 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
             num_inference_steps (`int`, defaults to 4):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            lcm_origin_steps (`int`, defaults to 50):
-                The number of LCM Scheduler denoising steps.
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps used to generate a linearly-spaced timestep schedule, from
+                which we will draw `num_inference_steps` evenly spaced timesteps as our final timestep schedule,
+                following the Skipping-Step method in the paper (see Section 4.3). If not set, this will default to
+                the scheduler's `original_inference_steps` attribute.
             guidance_scale (`float`, defaults to 7.5):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -290,14 +293,25 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         if generator is None:
             generator = np.random
 
+        # Create torch.Generator instance with same state as np.random.RandomState
+        torch_generator = torch.Generator().manual_seed(int(generator.get_state()[1][0]))
+
+        # do_classifier_free_guidance = guidance_scale > 1.0
+
+        # NOTE: when a LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided
+        # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the
+        # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts.
         prompt_embeds = self._encode_prompt(
             prompt,
             num_images_per_prompt,
+            False,
+            negative_prompt=None,
             prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=None,
         )
 
         # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps, lcm_origin_steps)
+        self.scheduler.set_timesteps(num_inference_steps, "cpu", original_inference_steps=original_inference_steps)
         timesteps = self.scheduler.timesteps
 
         latents = self.prepare_latents(
@@ -310,6 +324,13 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
             latents,
         )
 
+        # Get Guidance Scale Embedding
+        w = np.tile(guidance_scale - 1, batch_size * num_images_per_prompt)
+        w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.get("time_cond_proj_dim", 256))
+
+        # Adapted from diffusers to extend it for other runtimes than ORT
+        timestep_dtype = self.unet.input_dtype.get("timestep", np.float32)
+
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
@@ -319,13 +340,9 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
-        # Adapted from diffusers to extend it for other runtimes than ORT
-        timestep_dtype = self.unet.input_dtype.get("timestep", np.float32)
-
-        # Get Guidance Scale Embedding
-        w = np.tile(guidance_scale, batch_size * num_images_per_prompt)
-        w_embedding = self.get_w_embedding(w, embedding_dim=self.unet.config.get("time_cond_proj_dim", 256))
-
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = torch_generator
 
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         for i, t in enumerate(self.progress_bar(timesteps)):
@@ -333,11 +350,11 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
             # predict the noise residual
             timestep = np.array([t], dtype=timestep_dtype)
 
-            noise_pred = self.unet(sample=latents, timestep=timestep, encoder_hidden_states=prompt_embeds, timestep_cond=w_embedding)[0]
+            noise_pred = self.unet(sample=latents, timestep=timestep, timestep_cond=w_embedding, encoder_hidden_states=prompt_embeds)[0]
 
             # compute the previous noisy sample x_t -> x_t-1
             latents, denoised = self.scheduler.step(
-                torch.from_numpy(noise_pred), i, t, torch.from_numpy(latents), **extra_step_kwargs, return_dict=False
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs, return_dict=False
             )
 
             latents, denoised = latents.numpy(), denoised.numpy()
lcm_scheduler.py CHANGED
@@ -22,13 +22,16 @@ from typing import List, Optional, Tuple, Union
 import numpy as np
 import torch
 
-from diffusers import ConfigMixin, SchedulerMixin
-from diffusers.configuration_utils import register_to_config
-from diffusers.utils import BaseOutput
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils import BaseOutput, logging
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
 @dataclass
-# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
 class LCMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's `step` function output.
@@ -91,7 +94,8 @@ def betas_for_alpha_bar(
     return torch.tensor(betas, dtype=torch.float32)
 
 
-def rescale_zero_terminal_snr(betas):
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor:
     """
     Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
 
@@ -132,8 +136,10 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
     `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
     non-Markovian guidance.
 
-    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
-    methods the library implements for all schedulers such as loading and saving.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. [`~ConfigMixin`] takes care of storing all config
+    attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can be
+    accessed via `scheduler.config.num_train_timesteps`. [`SchedulerMixin`] provides general loading and saving
+    functionality via the [`SchedulerMixin.save_pretrained`] and [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`, defaults to 1000):
@@ -147,6 +153,9 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
             `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
         trained_betas (`np.ndarray`, *optional*):
            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        original_inference_steps (`int`, *optional*, defaults to 50):
+            The default number of inference steps used to generate a linearly-spaced timestep schedule, from which we
+            will ultimately take `num_inference_steps` evenly spaced timesteps to form the final timestep schedule.
         clip_sample (`bool`, defaults to `True`):
             Clip the predicted sample for numerical stability.
         clip_sample_range (`float`, defaults to 1.0):
@@ -179,24 +188,24 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
     """
 
-    # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
     order = 1
 
     @register_to_config
     def __init__(
         self,
         num_train_timesteps: int = 1000,
-        beta_start: float = 0.0001,
-        beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_start: float = 0.00085,
+        beta_end: float = 0.012,
+        beta_schedule: str = "scaled_linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-        clip_sample: bool = True,
+        original_inference_steps: int = 50,
+        clip_sample: bool = False,
+        clip_sample_range: float = 1.0,
         set_alpha_to_one: bool = True,
         steps_offset: int = 0,
         prediction_type: str = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
-        clip_sample_range: float = 1.0,
         sample_max_value: float = 1.0,
         timestep_spacing: str = "leading",
         rescale_betas_zero_snr: bool = False,
@@ -236,6 +245,30 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
         self.num_inference_steps = None
         self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
 
+        self._step_index = None
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+
+        index_candidates = (self.timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        if len(index_candidates) > 1:
+            step_index = index_candidates[1]
+        else:
+            step_index = index_candidates[0]
+
+        self._step_index = step_index.item()
+
+    @property
+    def step_index(self):
+        return self._step_index
+
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
@@ -246,23 +279,12 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
                 The input sample.
             timestep (`int`, *optional*):
                 The current timestep in the diffusion chain.
-
         Returns:
             `torch.FloatTensor`:
                 A scaled input sample.
         """
         return sample
 
-    def _get_variance(self, timestep, prev_timestep):
-        alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
-        beta_prod_t = 1 - alpha_prod_t
-        beta_prod_t_prev = 1 - alpha_prod_t_prev
-
-        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
-
-        return variance
-
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
         """
@@ -275,13 +297,13 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
         https://arxiv.org/abs/2205.11487
         """
         dtype = sample.dtype
-        batch_size, channels, height, width = sample.shape
+        batch_size, channels, *remaining_dims = sample.shape
 
         if dtype not in (torch.float32, torch.float64):
             sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
 
         # Flatten sample for doing quantile calculation along each image
-        sample = sample.reshape(batch_size, channels * height * width)
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
 
         abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
 
@@ -289,22 +311,33 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
         s = torch.clamp(
             s, min=1, max=self.config.sample_max_value
         )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
-
         s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
         sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
 
-        sample = sample.reshape(batch_size, channels, height, width)
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
         sample = sample.to(dtype)
 
         return sample
 
-    def set_timesteps(self, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None):
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        device: Union[str, torch.device] = None,
+        original_inference_steps: Optional[int] = None,
+    ):
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
         Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps, which will be used to generate a linearly-spaced timestep
+                schedule (which is different from the standard `diffusers` implementation). We will then take
+                `num_inference_steps` timesteps from this schedule, evenly spaced in terms of indices, and use that as
+                our final timestep schedule. If not set, this will default to the `original_inference_steps` attribute.
         """
 
         if num_inference_steps > self.config.num_train_timesteps:
@@ -315,36 +348,51 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
             )
 
         self.num_inference_steps = num_inference_steps
-
-        # LCM Timesteps Setting:  # Linear Spacing
-        c = self.config.num_train_timesteps // lcm_origin_steps
-        lcm_origin_timesteps = np.asarray(list(range(1, lcm_origin_steps + 1))) * c - 1  # LCM Training Steps Schedule
+        original_steps = (
+            original_inference_steps if original_inference_steps is not None else self.original_inference_steps
+        )
+
+        if original_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`original_steps`: {original_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        if num_inference_steps > original_steps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `original_inference_steps`:"
+                f" {original_steps} because the final timestep schedule will be a subset of the"
+                f" `original_inference_steps`-sized initial timestep schedule."
+            )
+
+        # LCM Timesteps Setting
+        # Currently, only linear spacing is supported.
+        c = self.config.num_train_timesteps // original_steps
+        # LCM Training Steps Schedule
+        lcm_origin_timesteps = np.asarray(list(range(1, original_steps + 1))) * c - 1
         skipping_step = len(lcm_origin_timesteps) // num_inference_steps
-        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]  # LCM Inference Steps Schedule
-
-        self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
+        # LCM Inference Steps Schedule
+        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]
+
+        self.timesteps = torch.from_numpy(timesteps.copy()).to(device=device, dtype=torch.long)
+
+        self._step_index = None
 
     def get_scalings_for_boundary_condition_discrete(self, t):
-        self.sigma_data = 0.5  # Default: 0.5
-
-        # By dividing 0.1: This is almost a delta function at t=0.
-        c_skip = self.sigma_data**2 / (
-            (t / 0.1) ** 2 + self.sigma_data**2
-        )
-        c_out = ((t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5)
+        self.sigma_data = 0.5  # Default: 0.5
+
+        # By dividing 0.1: This is almost a delta function at t=0.
+        c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
+        c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
         return c_skip, c_out
-
-
+
     def step(
         self,
         model_output: torch.FloatTensor,
-        timeindex: int,
         timestep: int,
         sample: torch.FloatTensor,
-        eta: float = 0.0,
-        use_clipped_model_output: bool = False,
-        generator=None,
-        variance_noise: Optional[torch.FloatTensor] = None,
+        generator: Optional[torch.Generator] = None,
         return_dict: bool = True,
     ) -> Union[LCMSchedulerOutput, Tuple]:
         """
@@ -358,77 +406,79 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
-            eta (`float`):
-                The weight of noise for added noise in diffusion step.
-            use_clipped_model_output (`bool`, defaults to `False`):
-                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
-                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
-                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
-                `use_clipped_model_output` has no effect.
            generator (`torch.Generator`, *optional*):
                A random number generator.
-            variance_noise (`torch.FloatTensor`):
-                Alternative to generating noise with `generator` by directly providing the noise for the variance
-                itself. Useful for methods such as [`CycleDiffusion`].
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
-
         Returns:
            [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.
-
         """
         if self.num_inference_steps is None:
             raise ValueError(
                 "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
             )
-
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
         # 1. get previous step value
-        prev_timeindex = timeindex + 1
-        if prev_timeindex < len(self.timesteps):
-            prev_timestep = self.timesteps[prev_timeindex]
+        prev_step_index = self.step_index + 1
+        if prev_step_index < len(self.timesteps):
+            prev_timestep = self.timesteps[prev_step_index]
         else:
             prev_timestep = timestep
-
+
         # 2. compute alphas, betas
         alpha_prod_t = self.alphas_cumprod[timestep]
         alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
-
+
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
-
+
         # 3. Get scalings for boundary conditions
         c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
-
-        # 4. Different Parameterization:
-        parameterization = self.config.prediction_type
-
-        if parameterization == "epsilon":  # noise-prediction
-            pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
-
-        elif parameterization == "sample":  # x-prediction
-            pred_x0 = model_output
-
-        elif parameterization == "v_prediction":  # v-prediction
-            pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
-
-        # 4. Denoise model output using boundary conditions
-        denoised = c_out * pred_x0 + c_skip * sample
-
-        # 5. Sample z ~ N(0, I), For MultiStep Inference
+
+        # 4. Compute the predicted original sample x_0 based on the model parameterization
+        if self.config.prediction_type == "epsilon":  # noise-prediction
+            predicted_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
+        elif self.config.prediction_type == "sample":  # x-prediction
+            predicted_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":  # v-prediction
+            predicted_original_sample = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction` for `LCMScheduler`."
+            )
+
+        # 5. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            predicted_original_sample = self._threshold_sample(predicted_original_sample)
+        elif self.config.clip_sample:
+            predicted_original_sample = predicted_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 6. Denoise model output using boundary conditions
+        denoised = c_out * predicted_original_sample + c_skip * sample
+
+        # 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference
         # Noise is not used for one-step sampling.
         if len(self.timesteps) > 1:
-            noise = torch.randn(model_output.shape).to(model_output.device)
+            noise = randn_tensor(model_output.shape, generator=generator, device=model_output.device)
             prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
         else:
             prev_sample = denoised
-
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
         if not return_dict:
             return (prev_sample, denoised)
-
+
         return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
-
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
     def add_noise(
model_index.json CHANGED
@@ -12,8 +12,8 @@
     "StableDiffusionSafetyChecker"
   ],
   "scheduler": [
-    "diffusers",
-    "PNDMScheduler"
+    null,
+    null
   ],
   "text_encoder": [
     "transformers",
scheduler/scheduler_config.json CHANGED
@@ -1,6 +1,6 @@
 {
-  "_class_name": "PNDMScheduler",
-  "_diffusers_version": "0.20.2",
+  "_class_name": "LCMScheduler",
+  "_diffusers_version": "0.22.0.dev0",
   "beta_end": 0.012,
   "beta_schedule": "scaled_linear",
   "beta_start": 0.00085,
@@ -8,12 +8,12 @@
   "clip_sample_range": 1.0,
   "dynamic_thresholding_ratio": 0.995,
   "num_train_timesteps": 1000,
+  "original_inference_steps": 50,
   "prediction_type": "epsilon",
   "rescale_betas_zero_snr": false,
   "sample_max_value": 1.0,
   "set_alpha_to_one": true,
-  "skip_prk_steps": false,
-  "steps_offset": 1,
+  "steps_offset": 0,
   "thresholding": false,
   "timestep_spacing": "leading",
   "trained_betas": null