tolgacangoz committed · Commit dc031b3 · Parent(s): 98b1844
Upload matryoshka.py
Browse files: scheduler/matryoshka.py (+116 -84)
scheduler/matryoshka.py
CHANGED
@@ -664,9 +664,7 @@ class MatryoshkaDDIMScheduler(SchedulerMixin, ConfigMixin):
             variance_noise = []
             for m_o in model_output:
                 variance_noise.append(
-                    randn_tensor(
-                        m_o.shape, generator=generator, device=m_o.device, dtype=m_o.dtype
-                    )
+                    randn_tensor(m_o.shape, generator=generator, device=m_o.device, dtype=m_o.dtype)
                 )
         else:
             variance_noise = randn_tensor(
@@ -1897,6 +1895,8 @@ class MatryoshkaCombinedTimestepTextEmbedding(nn.Module):
                 dim=1, keepdim=True
             )
             cond_emb = self.cond_emb(y)
+        else:
+            cond_emb = None
 
         if not masked_cross_attention:
             conditioning_mask = None
@@ -1905,11 +1905,8 @@ class MatryoshkaCombinedTimestepTextEmbedding(nn.Module):
         if micro is not None:
             temb = self.add_time_proj(torch.tensor([micro], device=emb.device, dtype=emb.dtype))
             temb_micro_conditioning = self.add_timestep_embedder(temb.to(emb.dtype))
-            if self.cond_emb is not None and not added_cond_kwargs.get("from_nested", False):
-
-                return cond_emb_micro, conditioning_mask, cond_emb
-            else:
-                return temb_micro_conditioning, conditioning_mask, None
+            # if self.cond_emb is not None and not added_cond_kwargs.get("from_nested", False):
+            return temb_micro_conditioning, conditioning_mask, cond_emb
 
         return cond_emb, conditioning_mask, cond_emb
 
@@ -3035,11 +3032,6 @@ class MatryoshkaUNet2DConditionModel(
             attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
             attention_mask = attention_mask.unsqueeze(1)
 
-        # convert encoder_attention_mask to a bias the same way we do for attention_mask
-        if encoder_attention_mask is not None:
-            encoder_attention_mask = (1 - encoder_attention_mask.to(sample[0][0].dtype)) * -10000.0
-            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
-
         # 0. center input if necessary
         if self.config.center_input_sample:
             sample = 2 * sample - 1.0
@@ -3059,6 +3051,7 @@ class MatryoshkaUNet2DConditionModel(
         added_cond_kwargs["masked_cross_attention"] = self.config.masked_cross_attention
         added_cond_kwargs["micro_conditioning_scale"] = self.config.micro_conditioning_scale
         added_cond_kwargs["from_nested"] = from_nested
+        added_cond_kwargs["conditioning_mask"] = encoder_attention_mask
 
         if not from_nested:
             encoder_hidden_states = self.process_encoder_hidden_states(
@@ -3073,6 +3066,11 @@ class MatryoshkaUNet2DConditionModel(
             emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
         )
 
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample[0][0].dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
         if self.config.addition_embed_type == "image_hint":
             aug_emb, hint = aug_emb
             sample = torch.cat([sample, hint], dim=1)
@@ -3483,11 +3481,6 @@ class NestedUNet2DConditionModel(MatryoshkaUNet2DConditionModel):
             attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
             attention_mask = attention_mask.unsqueeze(1)
 
-        # convert encoder_attention_mask to a bias the same way we do for attention_mask
-        if encoder_attention_mask is not None:
-            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
-            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
-
         # 0. center input if necessary
         if self.config.center_input_sample:
             sample = 2 * sample - 1.0
@@ -3507,21 +3500,22 @@ class NestedUNet2DConditionModel(MatryoshkaUNet2DConditionModel):
         added_cond_kwargs = added_cond_kwargs or {}
         added_cond_kwargs["masked_cross_attention"] = self.inner_unet.config.masked_cross_attention
         added_cond_kwargs["micro_conditioning_scale"] = self.config.micro_conditioning_scale
+        added_cond_kwargs["conditioning_mask"] = encoder_attention_mask
 
         if not self.config.nesting:
             encoder_hidden_states = self.inner_unet.process_encoder_hidden_states(
                 encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
             )
 
-            aug_emb_inner_unet,
+            aug_emb_inner_unet, cond_mask, cond_emb = self.inner_unet.get_aug_embed(
                 emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
             )
-
-            aug_emb,
+            added_cond_kwargs["masked_cross_attention"] = self.config.masked_cross_attention
+            aug_emb, __, _ = self.get_aug_embed(
                 emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
             )
         else:
-            aug_emb,
+            aug_emb, cond_mask, _ = self.get_aug_embed(
                 emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
             )
 
@@ -3529,19 +3523,25 @@ class NestedUNet2DConditionModel(MatryoshkaUNet2DConditionModel):
         added_cond_kwargs = added_cond_kwargs or {}
         added_cond_kwargs["masked_cross_attention"] = self.inner_unet.inner_unet.config.masked_cross_attention
         added_cond_kwargs["micro_conditioning_scale"] = self.config.micro_conditioning_scale
+        added_cond_kwargs["conditioning_mask"] = encoder_attention_mask
 
         encoder_hidden_states = self.inner_unet.inner_unet.process_encoder_hidden_states(
             encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
         )
 
-        aug_emb_inner_unet,
+        aug_emb_inner_unet, cond_mask, cond_emb = self.inner_unet.inner_unet.get_aug_embed(
            emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
         )
 
-        aug_emb,
+        aug_emb, __, _ = self.get_aug_embed(
            emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
         )
 
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
         if self.config.addition_embed_type == "image_hint":
             aug_emb, hint = aug_emb
             sample = torch.cat([sample, hint], dim=1)
@@ -3623,7 +3623,7 @@ class NestedUNet2DConditionModel(MatryoshkaUNet2DConditionModel):
             timestep,
             cond_emb=cond_emb,
             encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=
+            encoder_attention_mask=cond_mask,
             from_nested=True,
         )
         x_low, x_inner = inner_unet_output.sample, inner_unet_output.sample_inner
@@ -3911,9 +3911,6 @@ class MatryoshkaPipeline(
 
             text_inputs = self.tokenizer(
                 prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
                 return_tensors="pt",
             )
             text_input_ids = text_inputs.input_ids
@@ -3931,26 +3928,9 @@ class MatryoshkaPipeline(
             )
 
            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
+                prompt_attention_mask = text_inputs.attention_mask.to(device)
            else:
-                attention_mask = None
-
-            if clip_skip is None:
-                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
-                prompt_embeds = prompt_embeds[0]
-            else:
-                prompt_embeds = self.text_encoder(
-                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
-                )
-                # Access the `hidden_states` first, that contains a tuple of
-                # all the hidden states from the encoder layers. Then index into
-                # the tuple to access the hidden states from the desired layer.
-                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
-                # We also need to apply the final LayerNorm here to not mess with the
-                # representations. The `last_hidden_states` that we typically use for
-                # obtaining the final prompt representations passes through the LayerNorm
-                # layer.
-                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+                prompt_attention_mask = None
 
        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
@@ -3959,13 +3939,6 @@ class MatryoshkaPipeline(
        else:
            prompt_embeds_dtype = prompt_embeds.dtype
 
-        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
@@ -3991,41 +3964,78 @@ class MatryoshkaPipeline(
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
 
-            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
                return_tensors="pt",
            )
+            uncond_input_ids = uncond_input.input_ids
 
            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = uncond_input.attention_mask.to(device)
+                negative_prompt_attention_mask = uncond_input.attention_mask.to(device)
            else:
-                attention_mask = None
+                negative_prompt_attention_mask = None
 
-            negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
+        if not do_classifier_free_guidance:
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=prompt_attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+        else:
+            max_len = max(len(text_input_ids[0]), len(uncond_input_ids[0]))
+            if len(text_input_ids[0]) < max_len:
+                text_input_ids = torch.cat(
+                    [text_input_ids, torch.zeros(batch_size, max_len - len(text_input_ids[0]), dtype=torch.long)],
+                    dim=1,
+                )
+                prompt_attention_mask = torch.cat(
+                    [
+                        prompt_attention_mask,
+                        torch.zeros(batch_size, max_len - len(prompt_attention_mask[0]), dtype=torch.long),
+                    ],
+                    dim=1,
+                )
+            elif len(uncond_input_ids[0]) < max_len:
+                uncond_input_ids = torch.cat(
+                    [uncond_input_ids, torch.zeros(batch_size, max_len - len(uncond_input_ids[0]), dtype=torch.long)],
+                    dim=1,
+                )
+                negative_prompt_attention_mask = torch.cat(
+                    [
+                        negative_prompt_attention_mask,
+                        torch.zeros(batch_size, max_len - len(negative_prompt_attention_mask[0]), dtype=torch.long),
+                    ],
+                    dim=1,
+                )
+            cfg_input_ids = torch.cat([uncond_input_ids, text_input_ids], dim=0)
+            cfg_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+            prompt_embeds = self.text_encoder(
+                cfg_input_ids.to(device),
+                attention_mask=cfg_attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
-
-            if do_classifier_free_guidance:
-                # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-                seq_len = negative_prompt_embeds.shape[1]
+            prompt_embeds = prompt_embeds[0]
 
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
 
        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)
 
-        return prompt_embeds, negative_prompt_embeds
+        if not do_classifier_free_guidance:
+            return prompt_embeds, None, prompt_attention_mask, None
+        return prompt_embeds[1], prompt_embeds[0], prompt_attention_mask, negative_prompt_attention_mask
 
    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
        dtype = next(self.image_encoder.parameters()).dtype
@@ -4282,10 +4292,6 @@ class MatryoshkaPipeline(
    def interrupt(self):
        return self._interrupt
 
-    @property
-    def model_type(self):
-        return "nested_unet"
-
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
@@ -4462,7 +4468,12 @@ class MatryoshkaPipeline(
            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
        )
 
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
@@ -4478,7 +4489,12 @@ class MatryoshkaPipeline(
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if self.do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            prompt_embeds = torch.cat([negative_prompt_embeds.unsqueeze(0), prompt_embeds.unsqueeze(0)])
+            attention_masks = torch.cat([negative_prompt_attention_mask, prompt_attention_mask])
+        else:
+            attention_masks = prompt_attention_mask
+
+        prompt_embeds = prompt_embeds * attention_masks.unsqueeze(-1)
 
        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
@@ -4490,10 +4506,13 @@ class MatryoshkaPipeline(
        )
 
        # 4. Prepare timesteps
-
-
-
-
+        if isinstance(self.scheduler, MatryoshkaDDIMScheduler):
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler, num_inference_steps, device, timesteps, sigmas
+            )
+            timesteps = timesteps[:-1]  # is this correct???
+        else:
+            timesteps = self.scheduler.timesteps
 
        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
@@ -4552,6 +4571,7 @@ class MatryoshkaPipeline(
                    timestep_cond=timestep_cond,
                    cross_attention_kwargs=self.cross_attention_kwargs,
                    added_cond_kwargs=added_cond_kwargs,
+                    encoder_attention_mask=attention_masks,
                    return_dict=False,
                )[0]
 
@@ -4568,7 +4588,19 @@ class MatryoshkaPipeline(
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
 
                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if self.scheduler.scales is not None and not isinstance(self.scheduler, MatryoshkaDDIMScheduler):
+                    latents[0] = self.scheduler.step(
+                        noise_pred[0], t, latents[0], **extra_step_kwargs, return_dict=False
+                    )[0]
+                    latents[1] = self.scheduler.inner_scheduler.step(
+                        noise_pred[1], t, latents[1], **extra_step_kwargs, return_dict=False
+                    )[0]
+                    if len(latents) > 2:
+                        latents[2] = self.scheduler.inner_scheduler.inner_scheduler.step(
+                            noise_pred[2], t, latents[2], **extra_step_kwargs, return_dict=False
+                        )[0]
+                else:
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
                if callback_on_step_end is not None:
                    callback_kwargs = {}