# Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
# William Peebles and Saining Xie
#
# Copyright (c) 2021 OpenAI
# MIT License
#
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Optional, Tuple, Union

import torch

from ...models import AutoencoderKL, DiTTransformer2DModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import is_torch_xla_available
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


class DiTPipeline(DiffusionPipeline):
    r"""
    Pipeline for image generation based on a Transformer backbone instead of a UNet.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        transformer ([`DiTTransformer2DModel`]):
            A class-conditioned `DiTTransformer2DModel` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        scheduler ([`DDIMScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    """
    model_cpu_offload_seq = "transformer->vae"

    def __init__(
        self,
        transformer: DiTTransformer2DModel,
        vae: AutoencoderKL,
        scheduler: KarrasDiffusionSchedulers,
        id2label: Optional[Dict[int, str]] = None,
    ):
        super().__init__()
        self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler)

        # create an ImageNet label -> id dictionary for easier use
        self.labels = {}
        if id2label is not None:
            for key, value in id2label.items():
                for label in value.split(","):
                    self.labels[label.lstrip().rstrip()] = int(key)
            self.labels = dict(sorted(self.labels.items()))

    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
        r"""
        Map label strings from ImageNet to corresponding class ids.

        Parameters:
            label (`str` or `list` of `str`):
                Label strings to be mapped to class ids.

        Returns:
            `list` of `int`:
                Class ids to be processed by the pipeline.
        """

        if not isinstance(label, list):
            # wrap a single label string in a list instead of splitting it into characters
            label = [label]

        for l in label:
            if l not in self.labels:
                raise ValueError(
                    f"{l} does not exist. Please make sure to select one of the following labels: \n {self.labels}."
                )

        return [self.labels[l] for l in label]

    def __call__(
        self,
        class_labels: List[int],
        guidance_scale: float = 4.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        num_inference_steps: int = 50,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
        The call function to the pipeline for generation.

        Args:
            class_labels (`List[int]`):
                List of ImageNet class labels for the images to be generated.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the given
                class labels at the expense of lower image quality. Guidance scale is enabled when
                `guidance_scale > 1`.
            generator (`torch.Generator`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return an [`ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        ```py
        >>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler
        >>> import torch

        >>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        >>> pipe = pipe.to("cuda")

        >>> # pick words from ImageNet class labels
        >>> pipe.labels  # to print all available words

        >>> # pick words that exist in ImageNet
        >>> words = ["white shark", "umbrella"]
        >>> class_ids = pipe.get_label_ids(words)

        >>> generator = torch.manual_seed(33)
        >>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)

        >>> image = output.images[0]  # label 'white shark'
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        """
        batch_size = len(class_labels)
        latent_size = self.transformer.config.sample_size
        latent_channels = self.transformer.config.in_channels

        latents = randn_tensor(
            shape=(batch_size, latent_channels, latent_size, latent_size),
            generator=generator,
            device=self._execution_device,
            dtype=self.transformer.dtype,
        )
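
        # Classifier-free guidance: run the conditional and unconditional passes in one batch by
        # duplicating the latents and pairing the real class labels with the null class id (1000,
        # the extra "no class" embedding used by the ImageNet-trained DiT checkpoints).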
        latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1 else latents

        class_labels = torch.tensor(class_labels, device=self._execution_device).reshape(-1)
        class_null = torch.tensor([1000] * batch_size, device=self._execution_device)
        class_labels_input = torch.cat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels

        # set step values
        self.scheduler.set_timesteps(num_inference_steps)

        for t in self.progress_bar(self.scheduler.timesteps):
            if guidance_scale > 1:
                half = latent_model_input[: len(latent_model_input) // 2]
                latent_model_input = torch.cat([half, half], dim=0)
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            timesteps = t
            if not torch.is_tensor(timesteps):
                # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
                # This would be a good case for the `match` statement (Python 3.10+)
                is_mps = latent_model_input.device.type == "mps"
                is_npu = latent_model_input.device.type == "npu"
                if isinstance(timesteps, float):
                    dtype = torch.float32 if (is_mps or is_npu) else torch.float64
                else:
                    dtype = torch.int32 if (is_mps or is_npu) else torch.int64
                timesteps = torch.tensor([timesteps], dtype=dtype, device=latent_model_input.device)
            elif len(timesteps.shape) == 0:
                timesteps = timesteps[None].to(latent_model_input.device)
            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
            timesteps = timesteps.expand(latent_model_input.shape[0])

            # predict noise model_output
            noise_pred = self.transformer(
                latent_model_input, timestep=timesteps, class_labels=class_labels_input
            ).sample
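
            # With learned sigma, the transformer predicts 2 * latent_channels channels: the first
            # half is the noise estimate (eps), the second half the predicted variance. Guidance
            # below is applied to the eps channels only; the variance channels are passed through.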
            # perform guidance
            if guidance_scale > 1:
                eps, rest = noise_pred[:, :latent_channels], noise_pred[:, latent_channels:]
                cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)

                half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
                eps = torch.cat([half_eps, half_eps], dim=0)

                noise_pred = torch.cat([eps, rest], dim=1)

            # learned sigma
            if self.transformer.config.out_channels // 2 == latent_channels:
                model_output, _ = torch.split(noise_pred, latent_channels, dim=1)
            else:
                model_output = noise_pred

            # compute previous image: x_t -> x_t-1
            latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample
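
            # On XLA devices, flush the lazily-built graph once per denoising step.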
            if XLA_AVAILABLE:
                xm.mark_step()

        if guidance_scale > 1:
            latents, _ = latent_model_input.chunk(2, dim=0)
        else:
            latents = latent_model_input
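
        # Decode the latents back to image space: undo the VAE scaling factor, decode,
        # and map the result from [-1, 1] to [0, 1].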
        latents = 1 / self.vae.config.scaling_factor * latents
        samples = self.vae.decode(latents).sample

        samples = (samples / 2 + 0.5).clamp(0, 1)

        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        samples = samples.cpu().permute(0, 2, 3, 1).float().numpy()

        if output_type == "pil":
            samples = self.numpy_to_pil(samples)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (samples,)

        return ImagePipelineOutput(images=samples)