| import einops | |
| import inspect | |
| import torch | |
| import numpy as np | |
| import PIL | |
| import os | |
| from dataclasses import dataclass | |
| from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler | |
| from diffusers.pipelines.pipeline_utils import DiffusionPipeline | |
| from diffusers.utils import ( | |
| CONFIG_NAME, | |
| DEPRECATED_REVISION_ARGS, | |
| BaseOutput, | |
| PushToHubMixin, | |
| deprecate, | |
| is_accelerate_available, | |
| is_accelerate_version, | |
| is_torch_npu_available, | |
| is_torch_version, | |
| logging, | |
| numpy_to_pil, | |
| replace_example_docstring, | |
| ) | |
| from diffusers.models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin | |
| from diffusers.utils.torch_utils import randn_tensor | |
| # from diffusers.image_processor import VaeImageProcessor | |
| from transformers import T5EncoderModel, T5Tokenizer | |
| from typing import Any, Callable, Dict, List, Optional, Union | |
| from PIL import Image | |
| from onediffusion.models.denoiser.nextdit import NextDiT | |
| from onediffusion.dataset.utils import * | |
| from onediffusion.dataset.multitask.multiview import calculate_rays | |
| from onediffusion.diffusion.pipelines.image_processor import VaeImageProcessorOneDiffuser | |
| logger = logging.get_logger(__name__) # pylint: disable=invalid-name | |
| SUPPORTED_DEVICE_MAP = ["balanced"] | |
| EXAMPLE_DOC_STRING = """ | |
| Examples: | |
| ```py | |
| >>> import torch | |
| >>> from one_diffusion import OneDiffusionPipeline | |
| >>> pipe = OneDiffusionPipeline.from_pretrained("path_to_one_diffuser_model") | |
| >>> pipe = pipe.to("cuda") | |
| >>> prompt = "A beautiful sunset over the ocean" | |
| >>> image = pipe(prompt).images[0] | |
| >>> image.save("beautiful_sunset.png") | |
| ``` | |
| """ | |
| def create_c2w_matrix(azimuth_deg, elevation_deg, distance=1.0, target=np.array([0, 0, 0])): | |
| """ | |
| Create a Camera-to-World (C2W) matrix from azimuth and elevation angles. | |
| Parameters: | |
| - azimuth_deg: Azimuth angle in degrees. | |
| - elevation_deg: Elevation angle in degrees. | |
| - distance: Distance from the target point. | |
| - target: The point the camera is looking at in world coordinates. | |
| Returns: | |
| - C2W: A 4x4 NumPy array representing the Camera-to-World transformation matrix. | |
| """ | |
| # Convert angles from degrees to radians | |
| azimuth = np.deg2rad(azimuth_deg) | |
| elevation = np.deg2rad(elevation_deg) | |
| # Spherical to Cartesian conversion for camera position | |
| x = distance * np.cos(elevation) * np.cos(azimuth) | |
| y = distance * np.cos(elevation) * np.sin(azimuth) | |
| z = distance * np.sin(elevation) | |
| camera_position = np.array([x, y, z]) | |
| # Define the forward vector (from camera to target) | |
| target = 2*camera_position - target | |
| forward = target - camera_position | |
| forward /= np.linalg.norm(forward) | |
| # Define the world up vector | |
| world_up = np.array([0, 0, 1]) | |
| # Compute the right vector | |
| right = np.cross(world_up, forward) | |
| if np.linalg.norm(right) < 1e-6: | |
| # Handle the singularity when forward is parallel to world_up | |
| world_up = np.array([0, 1, 0]) | |
| right = np.cross(world_up, forward) | |
| right /= np.linalg.norm(right) | |
| # Recompute the orthogonal up vector | |
| up = np.cross(forward, right) | |
| # Construct the rotation matrix | |
| rotation = np.vstack([right, up, forward]).T # 3x3 | |
| # Construct the full C2W matrix | |
| C2W = np.eye(4) | |
| C2W[:3, :3] = rotation | |
| C2W[:3, 3] = camera_position | |
| return C2W | |
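# Illustrative sketch (not part of the original module): the camera matrices that `img2img` builds internally from
# `multiview_azimuths` / `multiview_elevations` / `multiview_distances`. The angles and the 1.7 distance below are
# assumptions mirroring the img2img defaults, not values required by the model.
#
#   >>> c2ws = [create_c2w_matrix(az, 0.0, distance=1.7) for az in (0, 30, 60, 90)]
#   >>> np.allclose(c2ws[0][:3, 3], [1.7, 0.0, 0.0])
#   True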
@dataclass
class OneDiffusionPipelineOutput(BaseOutput):
    """
    Output class for the OneDiffusion pipeline.

    Args:
        images (`List[PIL.Image.Image]` or `np.ndarray`):
            List of denoised PIL images of length `batch_size` or a numpy array of shape `(batch_size, height,
            width, num_channels)` containing the denoised images of the diffusion pipeline.
        latents (`torch.Tensor`, *optional*):
            The final latents, additionally returned by `img2img`.
    """

    images: Union[List[Image.Image], np.ndarray]
    latents: Optional[torch.Tensor] = None
| def retrieve_latents( | |
| encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" | |
| ): | |
| if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": | |
| return encoder_output.latent_dist.sample(generator) | |
| elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": | |
| return encoder_output.latent_dist.mode() | |
| elif hasattr(encoder_output, "latents"): | |
| return encoder_output.latents | |
| else: | |
| raise AttributeError("Could not access latents of provided encoder_output") | |
| def calculate_shift( | |
| image_seq_len, | |
| base_seq_len: int = 256, | |
| max_seq_len: int = 4096, | |
| base_shift: float = 0.5, | |
| max_shift: float = 1.16, | |
| # max_clip: float = 1.5, | |
| ): | |
| m = (max_shift - base_shift) / (max_seq_len - base_seq_len) # 0.000169270833 | |
| b = base_shift - m * base_seq_len # 0.5-0.0433333332 | |
| mu = image_seq_len * m + b | |
| # mu = min(mu, max_clip) | |
| return mu | |
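# Worked example (using the defaults above; the values actually passed in `__call__` come from the scheduler config of
# the loaded checkpoint): the shift is a linear interpolation in sequence length.
#
#   >>> m = (1.16 - 0.5) / (4096 - 256)   # ~1.719e-4
#   >>> b = 0.5 - m * 256                 # ~0.456
#   >>> round(1024 * m + b, 3)            # mu for a 1024-token latent grid
#   0.632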
| # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps | |
| def retrieve_timesteps( | |
| scheduler, | |
| num_inference_steps: Optional[int] = None, | |
| device: Optional[Union[str, torch.device]] = None, | |
| timesteps: Optional[List[int]] = None, | |
| sigmas: Optional[List[float]] = None, | |
| **kwargs, | |
| ): | |
| """ | |
| Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles | |
| custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. | |
| Args: | |
| scheduler (`SchedulerMixin`): | |
| The scheduler to get timesteps from. | |
| num_inference_steps (`int`): | |
| The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` | |
| must be `None`. | |
| device (`str` or `torch.device`, *optional*): | |
| The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. | |
| timesteps (`List[int]`, *optional*): | |
| Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, | |
| `num_inference_steps` and `sigmas` must be `None`. | |
| sigmas (`List[float]`, *optional*): | |
| Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, | |
| `num_inference_steps` and `timesteps` must be `None`. | |
| Returns: | |
| `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the | |
| second element is the number of inference steps. | |
| """ | |
| if timesteps is not None and sigmas is not None: | |
| raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") | |
| if timesteps is not None: | |
| accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) | |
| if not accepts_timesteps: | |
| raise ValueError( | |
| f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" | |
| f" timestep schedules. Please check whether you are using the correct scheduler." | |
| ) | |
| scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) | |
| timesteps = scheduler.timesteps | |
| num_inference_steps = len(timesteps) | |
| elif sigmas is not None: | |
| accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) | |
| if not accept_sigmas: | |
| raise ValueError( | |
| f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" | |
| f" sigmas schedules. Please check whether you are using the correct scheduler." | |
| ) | |
| scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) | |
| timesteps = scheduler.timesteps | |
| num_inference_steps = len(timesteps) | |
| else: | |
| scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) | |
| timesteps = scheduler.timesteps | |
| return timesteps, num_inference_steps | |
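# Minimal usage sketch (assumes a FlowMatchEulerDiscreteScheduler whose `set_timesteps` accepts `sigmas` and `mu`, as
# relied on by `__call__` below): custom sigmas plus a resolution-dependent shift reproduce the schedule the pipeline
# builds internally.
#
#   >>> sigmas = np.linspace(1.0, 1 / 30, 30)
#   >>> timesteps, n_steps = retrieve_timesteps(scheduler, 30, "cuda", None, sigmas, mu=0.632)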
| class OneDiffusionPipeline(DiffusionPipeline): | |
| r""" | |
| Pipeline for text-to-image generation using OneDiffuser. | |
| This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the | |
| library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) | |
| Args: | |
| transformer ([`NextDiT`]): | |
| Conditional transformer (NextDiT) architecture to denoise the encoded image latents. | |
| vae ([`AutoencoderKL`]): | |
| Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. | |
| text_encoder ([`T5EncoderModel`]): | |
| Frozen text-encoder. OneDiffuser uses the T5 model as text encoder. | |
| tokenizer (`T5Tokenizer`): | |
| Tokenizer of class T5Tokenizer. | |
| scheduler ([`FlowMatchEulerDiscreteScheduler`]): | |
| A scheduler to be used in combination with `transformer` to denoise the encoded image latents. | |
| """ | |
| def __init__( | |
| self, | |
| transformer: NextDiT, | |
| vae: AutoencoderKL, | |
| text_encoder: T5EncoderModel, | |
| tokenizer: T5Tokenizer, | |
| scheduler: FlowMatchEulerDiscreteScheduler, | |
| ): | |
| super().__init__() | |
| self.register_modules( | |
| transformer=transformer, | |
| vae=vae, | |
| text_encoder=text_encoder, | |
| tokenizer=tokenizer, | |
| scheduler=scheduler, | |
| ) | |
| self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) | |
| self.image_processor = VaeImageProcessorOneDiffuser(vae_scale_factor=self.vae_scale_factor) | |
| def enable_vae_slicing(self): | |
| self.vae.enable_slicing() | |
| def disable_vae_slicing(self): | |
| self.vae.disable_slicing() | |
| def enable_sequential_cpu_offload(self, gpu_id=0): | |
| if is_accelerate_available(): | |
| from accelerate import cpu_offload | |
| else: | |
| raise ImportError("Please install accelerate via `pip install accelerate`") | |
| device = torch.device(f"cuda:{gpu_id}") | |
| for cpu_offloaded_model in [self.transformer, self.text_encoder, self.vae]: | |
| if cpu_offloaded_model is not None: | |
| cpu_offload(cpu_offloaded_model, device) | |
    @property
    def _execution_device(self):
| if self.device != torch.device("meta") or not hasattr(self.transformer, "_hf_hook"): | |
| return self.device | |
| for module in self.transformer.modules(): | |
| if ( | |
| hasattr(module, "_hf_hook") | |
| and hasattr(module._hf_hook, "execution_device") | |
| and module._hf_hook.execution_device is not None | |
| ): | |
| return torch.device(module._hf_hook.execution_device) | |
| return self.device | |
| def encode_prompt( | |
| self, | |
| prompt, | |
| device, | |
| num_images_per_prompt, | |
| do_classifier_free_guidance, | |
| negative_prompt=None, | |
| max_length=300, | |
| ): | |
| batch_size = len(prompt) if isinstance(prompt, list) else 1 | |
| text_inputs = self.tokenizer( | |
| prompt, | |
| padding="max_length", | |
| max_length=max_length, | |
| truncation=True, | |
| add_special_tokens=True, | |
| return_tensors="pt", | |
| ) | |
| text_input_ids = text_inputs.input_ids | |
| attention_mask = text_inputs.attention_mask | |
| untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids | |
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
            logger.warning(
                "The following part of your input was truncated because the T5 text encoder can only handle"
                f" sequences up to {max_length} tokens: {removed_text}"
            )
| text_encoder_output = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask.to(device)) | |
| prompt_embeds = text_encoder_output[0].to(torch.float32) | |
| # duplicate text embeddings for each generation per prompt, using mps friendly method | |
| bs_embed, seq_len, _ = prompt_embeds.shape | |
| prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) | |
| prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) | |
| # duplicate attention mask for each generation per prompt | |
| attention_mask = attention_mask.repeat(1, num_images_per_prompt) | |
| attention_mask = attention_mask.view(bs_embed * num_images_per_prompt, -1) | |
| # get unconditional embeddings for classifier free guidance | |
| if do_classifier_free_guidance: | |
| uncond_tokens: List[str] | |
| if negative_prompt is None: | |
| uncond_tokens = [""] * batch_size | |
| elif type(prompt) is not type(negative_prompt): | |
| raise TypeError( | |
| f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" | |
| f" {type(prompt)}." | |
| ) | |
| elif isinstance(negative_prompt, str): | |
| uncond_tokens = [negative_prompt] | |
| elif batch_size != len(negative_prompt): | |
| raise ValueError( | |
| f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" | |
| f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" | |
| " the batch size of `prompt`." | |
| ) | |
| else: | |
| uncond_tokens = negative_prompt | |
| max_length = text_input_ids.shape[-1] | |
| uncond_input = self.tokenizer( | |
| uncond_tokens, | |
| padding="max_length", | |
| max_length=max_length, | |
| truncation=True, | |
| return_tensors="pt", | |
| ) | |
| uncond_encoder_output = self.text_encoder(uncond_input.input_ids.to(device), attention_mask=uncond_input.attention_mask.to(device)) | |
| negative_prompt_embeds = uncond_encoder_output[0].to(torch.float32) | |
| # duplicate unconditional embeddings for each generation per prompt, using mps friendly method | |
| seq_len = negative_prompt_embeds.shape[1] | |
| negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) | |
| negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) | |
| # duplicate unconditional attention mask for each generation per prompt | |
| uncond_attention_mask = uncond_input.attention_mask.repeat(1, num_images_per_prompt) | |
| uncond_attention_mask = uncond_attention_mask.view(batch_size * num_images_per_prompt, -1) | |
| # For classifier free guidance, we need to do two forward passes. | |
| # Here we concatenate the unconditional and text embeddings into a single batch | |
| # to avoid doing two forward passes | |
| prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) | |
| attention_mask = torch.cat([uncond_attention_mask, attention_mask]) | |
| return prompt_embeds.to(device), attention_mask.to(device) | |
| def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): | |
| shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) | |
| if isinstance(generator, list) and len(generator) != batch_size: | |
| raise ValueError( | |
| f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" | |
| f" size of {batch_size}. Make sure the batch size matches the length of the generators." | |
| ) | |
| if latents is None: | |
| latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) | |
| else: | |
| latents = latents.to(device) | |
| # scale the initial noise by the standard deviation required by the scheduler | |
| latents = latents * self.scheduler.init_noise_sigma | |
| return latents | |
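    # NOTE: this text-to-image variant of `prepare_latents` is shadowed by the definition further down in the class
    # (the img2img variant that also accepts `image=`). Because the later `def` wins at class-creation time, `__call__`
    # effectively uses that version with `image=None`, where the `init_noise_sigma` scaling is commented out.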
    @torch.no_grad()
    def __call__(
| self, | |
| prompt: Union[str, List[str]] = None, | |
| height: Optional[int] = None, | |
| width: Optional[int] = None, | |
| num_inference_steps: int = 50, | |
| guidance_scale: float = 5.0, | |
| negative_prompt: Optional[Union[str, List[str]]] = None, | |
| num_images_per_prompt: Optional[int] = 1, | |
| eta: float = 0.0, | |
| generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, | |
| latents: Optional[torch.FloatTensor] = None, | |
| output_type: Optional[str] = "pil", | |
| return_dict: bool = True, | |
| callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, | |
| callback_steps: int = 1, | |
| forward_kwargs: Optional[Dict[str, Any]] = {}, | |
| **kwargs, | |
| ): | |
| r""" | |
| Function invoked when calling the pipeline for generation. | |
| Args: | |
| prompt (`str` or `List[str]`, *optional*): | |
| The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. | |
| height (`int`, *optional*, defaults to self.transformer.config.sample_size): | |
| The height in pixels of the generated image. | |
| width (`int`, *optional*, defaults to self.transformer.config.sample_size): | |
| The width in pixels of the generated image. | |
| num_inference_steps (`int`, *optional*, defaults to 50): | |
| The number of denoising steps. More denoising steps usually lead to a higher quality image at the | |
| expense of slower inference. | |
| guidance_scale (`float`, *optional*, defaults to 7.5): | |
| Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). | |
| `guidance_scale` is defined as `w` of equation 2. of [Imagen | |
| Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > | |
| 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, | |
| usually at the expense of lower image quality. | |
| negative_prompt (`str` or `List[str]`, *optional*): | |
| The prompt or prompts not to guide the image generation. If not defined, one has to pass | |
| `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is | |
| less than `1`). | |
| num_images_per_prompt (`int`, *optional*, defaults to 1): | |
| The number of images to generate per prompt. | |
| eta (`float`, *optional*, defaults to 0.0): | |
| Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to | |
| [`schedulers.DDIMScheduler`], will be ignored for others. | |
| generator (`torch.Generator` or `List[torch.Generator]`, *optional*): | |
| One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) | |
| to make generation deterministic. | |
| latents (`torch.FloatTensor`, *optional*): | |
| Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image | |
| generation. Can be used to tweak the same generation with different prompts. If not provided, a latents | |
| tensor will ge generated by sampling using the supplied random `generator`. | |
| output_type (`str`, *optional*, defaults to `"pil"`): | |
| The output format of the generate image. Choose between | |
| [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. | |
| return_dict (`bool`, *optional*, defaults to `True`): | |
| Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a | |
| plain tuple. | |
| callback (`Callable`, *optional*): | |
| A function that will be called every `callback_steps` steps during inference. The function will be | |
| called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. | |
| callback_steps (`int`, *optional*, defaults to 1): | |
| The frequency at which the `callback` function will be called. If not specified, the callback will be | |
| called at every step. | |
| Examples: | |
| Returns: | |
| [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: | |
| [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. | |
| When returning a tuple, the first element is a list with the generated images, and the second element is a | |
| list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" | |
| (nsfw) content, according to the `safety_checker`. | |
| """ | |
| height = height or self.transformer.config.input_size[-2] * 8 # TODO: Hardcoded downscale factor of vae | |
| width = width or self.transformer.config.input_size[-1] * 8 | |
| # check inputs. Raise error if not correct | |
| self.check_inputs(prompt, height, width, callback_steps) | |
| # define call parameters | |
| batch_size = 1 if isinstance(prompt, str) else len(prompt) | |
| device = self._execution_device | |
| # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) | |
| # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf | |
| do_classifier_free_guidance = guidance_scale > 1.0 | |
| encoder_hidden_states, encoder_attention_mask = self.encode_prompt( | |
| prompt, | |
| device, | |
| num_images_per_prompt, | |
| do_classifier_free_guidance, | |
| negative_prompt, | |
| ) | |
| # set timesteps | |
| # # self.scheduler.set_timesteps(num_inference_steps, device=device) | |
| # timesteps = self.scheduler.timesteps | |
| timesteps = None | |
| # prepare latent variables | |
| num_channels_latents = self.transformer.config.in_channels | |
| latents = self.prepare_latents( | |
| batch_size * num_images_per_prompt, | |
| num_channels_latents, | |
| height, | |
| width, | |
| self.dtype, | |
| device, | |
| generator, | |
| latents, | |
| ) | |
| # prepare extra step kwargs | |
| extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) | |
| # 5. Prepare timesteps | |
| sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) | |
| image_seq_len = latents.shape[-1] * latents.shape[-2] / self.transformer.config.patch_size[-1] / self.transformer.config.patch_size[-2] | |
| mu = calculate_shift( | |
| image_seq_len, | |
| self.scheduler.config.base_image_seq_len, | |
| self.scheduler.config.max_image_seq_len, | |
| self.scheduler.config.base_shift, | |
| self.scheduler.config.max_shift, | |
| ) | |
| timesteps, num_inference_steps = retrieve_timesteps( | |
| self.scheduler, | |
| num_inference_steps, | |
| device, | |
| timesteps, | |
| sigmas, | |
| mu=mu, | |
| ) | |
| num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) | |
| self._num_timesteps = len(timesteps) | |
        # denoising loop
| with self.progress_bar(total=num_inference_steps) as progress_bar: | |
| for i, t in enumerate(timesteps): | |
| # expand the latents if we are doing classifier free guidance | |
| latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents | |
| # latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) | |
| # predict the noise residual | |
| noise_pred = self.transformer( | |
| samples=latent_model_input.to(self.dtype), | |
| timesteps=torch.tensor([t] * latent_model_input.shape[0], device=device), | |
| encoder_hidden_states=encoder_hidden_states.to(self.dtype), | |
| encoder_attention_mask=encoder_attention_mask, | |
| **forward_kwargs | |
| ) | |
| # perform guidance | |
| if do_classifier_free_guidance: | |
| noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) | |
| noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) | |
| # compute the previous noisy sample x_t -> x_t-1 | |
| latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample | |
| # call the callback, if provided | |
| if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): | |
| progress_bar.update() | |
| if callback is not None and i % callback_steps == 0: | |
| callback(i, t, latents) | |
| # scale and decode the image latents with vae | |
| latents = 1 / self.vae.config.scaling_factor * latents | |
| if latents.ndim == 5: | |
| latents = latents.squeeze(1) | |
| image = self.vae.decode(latents.to(self.vae.dtype)).sample | |
| image = (image / 2 + 0.5).clamp(0, 1) | |
| image = image.cpu().permute(0, 2, 3, 1).float().numpy() | |
| if output_type == "pil": | |
| image = self.numpy_to_pil(image) | |
| if not return_dict: | |
| return (image, None) | |
| return OneDiffusionPipelineOutput(images=image) | |
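    # Usage sketch for text-to-image (mirrors EXAMPLE_DOC_STRING; the checkpoint path is a placeholder and the
    # "[[text2image]]" task prefix is an assumption about prompt formatting, not something enforced by this code):
    #
    #   >>> pipe = OneDiffusionPipeline.from_pretrained("path_to_one_diffuser_model").to("cuda")
    #   >>> out = pipe("[[text2image]] A beautiful sunset over the ocean", height=512, width=512,
    #   ...            num_inference_steps=50, guidance_scale=5.0)
    #   >>> out.images[0].save("sunset.png")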
    @torch.no_grad()
    def img2img(
| self, | |
| prompt: Union[str, List[str]] = None, | |
| image: Union[PIL.Image.Image, List[PIL.Image.Image]] = None, | |
| height: Optional[int] = None, | |
| width: Optional[int] = None, | |
| num_inference_steps: int = 50, | |
| guidance_scale: float = 5.0, | |
| denoise_mask: Optional[List[int]] = [1, 0], | |
| negative_prompt: Optional[Union[str, List[str]]] = None, | |
| num_images_per_prompt: Optional[int] = 1, | |
| eta: float = 0.0, | |
| generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, | |
| latents: Optional[torch.FloatTensor] = None, | |
| output_type: Optional[str] = "pil", | |
| return_dict: bool = True, | |
| callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, | |
| callback_steps: int = 1, | |
| do_crop: bool = True, | |
| is_multiview: bool = False, | |
| multiview_azimuths: Optional[List[int]] = [0, 30, 60, 90], | |
| multiview_elevations: Optional[List[int]] = [0, 0, 0, 0], | |
| multiview_distances: float = 1.7, | |
| multiview_c2ws: Optional[List[torch.Tensor]] = None, | |
| multiview_intrinsics: Optional[torch.Tensor] = None, | |
| multiview_focal_length: float = 1.3887, | |
| forward_kwargs: Optional[Dict[str, Any]] = {}, | |
| noise_scale: float = 1.0, | |
| **kwargs, | |
| ): | |
| # Convert single image to list for consistent handling | |
| if isinstance(image, PIL.Image.Image): | |
| image = [image] | |
| if height is None or width is None: | |
| closest_ar = get_closest_ratio(height=image[0].size[1], width=image[0].size[0], ratios=ASPECT_RATIO_512) | |
| height, width = int(closest_ar[0][0]), int(closest_ar[0][1]) | |
| if not isinstance(multiview_distances, list) and not isinstance(multiview_distances, tuple): | |
| multiview_distances = [multiview_distances] * len(multiview_azimuths) | |
| # height = height or self.transformer.config.input_size[-2] * 8 # TODO: Hardcoded downscale factor of vae | |
| # width = width or self.transformer.config.input_size[-1] * 8 | |
| # 1. check inputs. Raise error if not correct | |
| self.check_inputs(prompt, height, width, callback_steps) | |
| # Additional input validation for image list | |
| if not all(isinstance(img, PIL.Image.Image) for img in image): | |
| raise ValueError("All elements in image list must be PIL.Image objects") | |
| # 2. define call parameters | |
| batch_size = 1 if isinstance(prompt, str) else len(prompt) | |
| device = self._execution_device | |
| do_classifier_free_guidance = guidance_scale > 1.0 | |
| # 3. Encode input prompt | |
| encoder_hidden_states, encoder_attention_mask = self.encode_prompt( | |
| prompt, | |
| device, | |
| num_images_per_prompt, | |
| do_classifier_free_guidance, | |
| negative_prompt, | |
| ) | |
| # 4. Preprocess all images | |
| if image is not None and len(image) > 0: | |
| processed_image = self.image_processor.preprocess(image, height=height, width=width, do_crop=do_crop) | |
| else: | |
| processed_image = None | |
| # # Stack processed images along the sequence dimension | |
| # if len(processed_images) > 1: | |
| # processed_image = torch.cat(processed_images, dim=0) | |
| # else: | |
| # processed_image = processed_images[0] | |
| timesteps = None | |
| # 6. prepare latent variables | |
| num_channels_latents = self.transformer.config.in_channels | |
| if processed_image is not None: | |
| cond_latents = self.prepare_latents( | |
| batch_size * num_images_per_prompt, | |
| num_channels_latents, | |
| height, | |
| width, | |
| self.dtype, | |
| device, | |
| generator, | |
| latents, | |
| image=processed_image, | |
| ) | |
| else: | |
| cond_latents = None | |
| # 7. prepare extra step kwargs | |
| extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) | |
| denoise_mask = torch.tensor(denoise_mask, device=device) | |
| denoise_indices = torch.where(denoise_mask == 1)[0] | |
| cond_indices = torch.where(denoise_mask == 0)[0] | |
| seq_length = denoise_mask.shape[0] | |
| latents = self.prepare_init_latents( | |
| batch_size * num_images_per_prompt, | |
| seq_length, | |
| num_channels_latents, | |
| height, | |
| width, | |
| self.dtype, | |
| device, | |
| generator, | |
| ) | |
| # 5. Prepare timesteps | |
| sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) | |
| # image_seq_len = latents.shape[1] * latents.shape[-1] * latents.shape[-2] / self.transformer.config.patch_size[-1] / self.transformer.config.patch_size[-2] | |
| image_seq_len = noise_scale * sum(denoise_mask) * latents.shape[-1] * latents.shape[-2] / self.transformer.config.patch_size[-1] / self.transformer.config.patch_size[-2] | |
| # image_seq_len = 256 | |
| mu = calculate_shift( | |
| image_seq_len, | |
| self.scheduler.config.base_image_seq_len, | |
| self.scheduler.config.max_image_seq_len, | |
| self.scheduler.config.base_shift, | |
| self.scheduler.config.max_shift, | |
| ) | |
| timesteps, num_inference_steps = retrieve_timesteps( | |
| self.scheduler, | |
| num_inference_steps, | |
| device, | |
| timesteps, | |
| sigmas, | |
| mu=mu, | |
| ) | |
| num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) | |
| self._num_timesteps = len(timesteps) | |
| if is_multiview: | |
| cond_indices_images = [index // 2 for index in cond_indices if index % 2 == 0] | |
| cond_indices_rays = [index // 2 for index in cond_indices if index % 2 == 1] | |
| multiview_elevations = [element for element in multiview_elevations if element is not None] | |
| multiview_azimuths = [element for element in multiview_azimuths if element is not None] | |
| multiview_distances = [element for element in multiview_distances if element is not None] | |
| if multiview_c2ws is None: | |
| multiview_c2ws = [ | |
| torch.tensor(create_c2w_matrix(azimuth, elevation, distance)) for azimuth, elevation, distance in zip(multiview_azimuths, multiview_elevations, multiview_distances) | |
| ] | |
| c2ws = torch.stack(multiview_c2ws).float() | |
| else: | |
| c2ws = torch.Tensor(multiview_c2ws).float() | |
| c2ws[:, 0:3, 1:3] *= -1 | |
| c2ws = c2ws[:, [1, 0, 2, 3], :] | |
| c2ws[:, 2, :] *= -1 | |
| w2cs = torch.inverse(c2ws) | |
| if multiview_intrinsics is None: | |
| multiview_intrinsics = torch.Tensor([[[multiview_focal_length, 0, 0.5], [0, multiview_focal_length, 0.5], [0, 0, 1]]]).repeat(c2ws.shape[0], 1, 1) | |
| K = multiview_intrinsics | |
| Rs = w2cs[:, :3, :3] | |
| Ts = w2cs[:, :3, 3] | |
| sizes = torch.Tensor([[1, 1]]).repeat(c2ws.shape[0], 1) | |
| assert height == width | |
| cond_rays = calculate_rays(K, sizes, Rs, Ts, height // 8) | |
| cond_rays = cond_rays.reshape(-1, height // 8, width // 8, 6) | |
| # padding = (0, 10) | |
| # cond_rays = torch.nn.functional.pad(cond_rays, padding, "constant", 0) | |
| cond_rays = torch.cat([cond_rays, cond_rays, cond_rays[..., :4]], dim=-1) * 1.658 | |
| cond_rays = cond_rays[None].repeat(batch_size * num_images_per_prompt, 1, 1, 1, 1) | |
| cond_rays = cond_rays.permute(0, 1, 4, 2, 3) | |
| cond_rays = cond_rays.to(device, dtype=self.dtype) | |
| latents = einops.rearrange(latents, "b (f n) c h w -> b f n c h w", n=2) | |
| if cond_latents is not None: | |
| latents[:, cond_indices_images, 0] = cond_latents | |
| latents[:, cond_indices_rays, 1] = cond_rays | |
| latents = einops.rearrange(latents, "b f n c h w -> b (f n) c h w") | |
| else: | |
| if cond_latents is not None: | |
| latents[:, cond_indices] = cond_latents | |
        # denoising loop
| with self.progress_bar(total=num_inference_steps) as progress_bar: | |
| for i, t in enumerate(timesteps): | |
| # expand the latents if we are doing classifier free guidance | |
| latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents | |
| input_t = torch.broadcast_to(einops.repeat(torch.Tensor([t]).to(device), "1 -> 1 f 1 1 1", f=latent_model_input.shape[1]), latent_model_input.shape).clone() | |
| if is_multiview: | |
| input_t = einops.rearrange(input_t, "b (f n) c h w -> b f n c h w", n=2) | |
| input_t[:, cond_indices_images, 0] = self.scheduler.timesteps[-1] | |
| input_t[:, cond_indices_rays, 1] = self.scheduler.timesteps[-1] | |
| input_t = einops.rearrange(input_t, "b f n c h w -> b (f n) c h w") | |
| else: | |
| input_t[:, cond_indices] = self.scheduler.timesteps[-1] | |
| # predict the noise residual | |
| noise_pred = self.transformer( | |
| samples=latent_model_input.to(self.dtype), | |
| timesteps=input_t, | |
| encoder_hidden_states=encoder_hidden_states.to(self.dtype), | |
| encoder_attention_mask=encoder_attention_mask, | |
| **forward_kwargs | |
| ) | |
| # perform guidance | |
| if do_classifier_free_guidance: | |
| noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) | |
| noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) | |
| # compute the previous noisy sample x_t -> x_t-1 | |
| bs, n_frame = noise_pred.shape[:2] | |
| noise_pred = einops.rearrange(noise_pred, "b f c h w -> (b f) c h w") | |
| latents = einops.rearrange(latents, "b f c h w -> (b f) c h w") | |
| latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample | |
| latents = einops.rearrange(latents, "(b f) c h w -> b f c h w", b=bs, f=n_frame) | |
| if is_multiview: | |
| latents = einops.rearrange(latents, "b (f n) c h w -> b f n c h w", n=2) | |
| if cond_latents is not None: | |
| latents[:, cond_indices_images, 0] = cond_latents | |
| latents[:, cond_indices_rays, 1] = cond_rays | |
| latents = einops.rearrange(latents, "b f n c h w -> b (f n) c h w") | |
| else: | |
| if cond_latents is not None: | |
| latents[:, cond_indices] = cond_latents | |
| # call the callback, if provided | |
| if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): | |
| progress_bar.update() | |
| if callback is not None and i % callback_steps == 0: | |
| callback(i, t, latents) | |
| decoded_latents = latents / 1.658 | |
| # scale and decode the image latents with vae | |
| latents = 1 / self.vae.config.scaling_factor * latents | |
| if latents.ndim == 5: | |
| latents = latents[:, denoise_indices] | |
| latents = einops.rearrange(latents, "b f c h w -> (b f) c h w") | |
| image = self.vae.decode(latents.to(self.vae.dtype)).sample | |
| image = (image / 2 + 0.5).clamp(0, 1) | |
| image = image.cpu().permute(0, 2, 3, 1).float().numpy() | |
| if output_type == "pil": | |
| image = self.numpy_to_pil(image) | |
| if not return_dict: | |
| return (image, None) | |
| return OneDiffusionPipelineOutput(images=image, latents=decoded_latents) | |
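    # Usage sketch for image-conditioned generation (prompt wording and file paths are placeholders). `denoise_mask`
    # follows the convention used above: 1 marks a frame to be denoised, 0 marks a frame held fixed as conditioning.
    #
    #   >>> cond = Image.open("input.jpg").convert("RGB")
    #   >>> out = pipe.img2img(prompt="an edited version of the scene", image=[cond],
    #   ...                    denoise_mask=[1, 0], num_inference_steps=50, guidance_scale=4.0)
    #   >>> out.images[0].save("edited.png")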
| def prepare_extra_step_kwargs(self, generator, eta): | |
| # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature | |
| # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. | |
| # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 | |
| # and should be between [0, 1] | |
| accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) | |
| extra_step_kwargs = {} | |
| if accepts_eta: | |
| extra_step_kwargs["eta"] = eta | |
| # check if the scheduler accepts generator | |
| accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) | |
| if accepts_generator: | |
| extra_step_kwargs["generator"] = generator | |
| return extra_step_kwargs | |
| def check_inputs(self, prompt, height, width, callback_steps): | |
| if not isinstance(prompt, str) and not isinstance(prompt, list): | |
| raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") | |
| if height % 16 != 0 or width % 16 != 0: | |
| raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.") | |
| if (callback_steps is None) or ( | |
| callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) | |
| ): | |
| raise ValueError( | |
| f"`callback_steps` has to be a positive integer but is {callback_steps} of type" | |
| f" {type(callback_steps)}." | |
| ) | |
| def get_timesteps(self, num_inference_steps, strength, device): | |
| # get the original timestep using init_timestep | |
| init_timestep = min(int(num_inference_steps * strength), num_inference_steps) | |
| t_start = max(num_inference_steps - init_timestep, 0) | |
| timesteps = self.scheduler.timesteps[t_start:] | |
| return timesteps, num_inference_steps - t_start | |
| def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None, image=None): | |
| shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) | |
| if isinstance(generator, list) and len(generator) != batch_size: | |
| raise ValueError( | |
| f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" | |
| f" size of {batch_size}. Make sure the batch size matches the length of the generators." | |
| ) | |
| if latents is None: | |
| latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) | |
| else: | |
| latents = latents.to(device) | |
| if image is None: | |
| # scale the initial noise by the standard deviation required by the scheduler | |
| # latents = latents * self.scheduler.init_noise_sigma | |
| return latents | |
| image = image.to(device=device, dtype=dtype) | |
| if isinstance(generator, list) and len(generator) != batch_size: | |
| raise ValueError( | |
| f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" | |
| f" size of {batch_size}. Make sure the batch size matches the length of the generators." | |
| ) | |
| elif isinstance(generator, list): | |
| if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: | |
| image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) | |
| elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: | |
| raise ValueError( | |
| f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " | |
| ) | |
| init_latents = [ | |
| retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) | |
| for i in range(batch_size) | |
| ] | |
| init_latents = torch.cat(init_latents, dim=0) | |
| else: | |
| init_latents = retrieve_latents(self.vae.encode(image.to(self.vae.dtype)), generator=generator) | |
| init_latents = self.vae.config.scaling_factor * init_latents | |
| init_latents = init_latents.to(device=device, dtype=dtype) | |
| init_latents = einops.rearrange(init_latents, "(bs views) c h w -> bs views c h w", bs=batch_size, views=init_latents.shape[0]//batch_size) | |
| # latents = einops.rearrange(latents, "b c h w -> b 1 c h w") | |
| # latents = torch.concat([latents, init_latents], dim=1) | |
| return init_latents | |
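    # Shape convention used by img2img: `prepare_init_latents` below allocates noise of shape
    # (batch, seq_length, C, H // vae_scale_factor, W // vae_scale_factor) with seq_length = len(denoise_mask), and
    # the conditioning latents produced above are written into the positions whose mask value is 0. In the multiview
    # branch each logical view occupies two consecutive entries (image latent, ray embedding), hence the `n=2`
    # einops rearranges.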
| def prepare_init_latents(self, batch_size, seq_length, num_channels_latents, height, width, dtype, device, generator, latents=None): | |
| shape = (batch_size, seq_length, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) | |
| if isinstance(generator, list) and len(generator) != batch_size: | |
| raise ValueError( | |
| f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" | |
| f" size of {batch_size}. Make sure the batch size matches the length of the generators." | |
| ) | |
| if latents is None: | |
| latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) | |
| else: | |
| latents = latents.to(device) | |
| return latents | |
| def generate( | |
| self, | |
| prompt: Union[str, List[str]], | |
| num_inference_steps: int = 50, | |
| guidance_scale: float = 5.0, | |
| negative_prompt: Optional[Union[str, List[str]]] = None, | |
| num_images_per_prompt: Optional[int] = 1, | |
| height: Optional[int] = None, | |
| width: Optional[int] = None, | |
| eta: float = 0.0, | |
| generator: Optional[torch.Generator] = None, | |
| latents: Optional[torch.FloatTensor] = None, | |
| output_type: Optional[str] = "pil", | |
| return_dict: bool = True, | |
| callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, | |
| callback_steps: Optional[int] = 1, | |
| ): | |
| """ | |
| Function for image generation using the OneDiffusionPipeline. | |
| """ | |
| return self( | |
| prompt=prompt, | |
| num_inference_steps=num_inference_steps, | |
| guidance_scale=guidance_scale, | |
| negative_prompt=negative_prompt, | |
| num_images_per_prompt=num_images_per_prompt, | |
| height=height, | |
| width=width, | |
| eta=eta, | |
| generator=generator, | |
| latents=latents, | |
| output_type=output_type, | |
| return_dict=return_dict, | |
| callback=callback, | |
| callback_steps=callback_steps, | |
| ) | |
    @staticmethod
    def numpy_to_pil(images):
| """ | |
| Convert a numpy image or a batch of images to a PIL image. | |
| """ | |
| if images.ndim == 3: | |
| images = images[None, ...] | |
| images = (images * 255).round().astype("uint8") | |
| if images.shape[-1] == 1: | |
| # special case for grayscale (single channel) images | |
| pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] | |
| else: | |
| pil_images = [Image.fromarray(image) for image in images] | |
| return pil_images | |
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
| model_path = pretrained_model_name_or_path | |
| cache_dir = kwargs.pop("cache_dir", None) | |
| force_download = kwargs.pop("force_download", False) | |
| proxies = kwargs.pop("proxies", None) | |
| local_files_only = kwargs.pop("local_files_only", None) | |
| token = kwargs.pop("token", None) | |
| revision = kwargs.pop("revision", None) | |
| from_flax = kwargs.pop("from_flax", False) | |
| torch_dtype = kwargs.pop("torch_dtype", None) | |
| custom_pipeline = kwargs.pop("custom_pipeline", None) | |
| custom_revision = kwargs.pop("custom_revision", None) | |
| provider = kwargs.pop("provider", None) | |
| sess_options = kwargs.pop("sess_options", None) | |
| device_map = kwargs.pop("device_map", None) | |
| max_memory = kwargs.pop("max_memory", None) | |
| offload_folder = kwargs.pop("offload_folder", None) | |
| offload_state_dict = kwargs.pop("offload_state_dict", False) | |
| low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) | |
| variant = kwargs.pop("variant", None) | |
| use_safetensors = kwargs.pop("use_safetensors", None) | |
| use_onnx = kwargs.pop("use_onnx", None) | |
| load_connected_pipeline = kwargs.pop("load_connected_pipeline", False) | |
| if low_cpu_mem_usage and not is_accelerate_available(): | |
| low_cpu_mem_usage = False | |
| logger.warning( | |
| "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" | |
| " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" | |
| " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" | |
| " install accelerate\n```\n." | |
| ) | |
| if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): | |
| raise NotImplementedError( | |
| "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" | |
| " `low_cpu_mem_usage=False`." | |
| ) | |
| if device_map is not None and not is_torch_version(">=", "1.9.0"): | |
| raise NotImplementedError( | |
| "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set" | |
| " `device_map=None`." | |
| ) | |
| if device_map is not None and not is_accelerate_available(): | |
| raise NotImplementedError( | |
| "Using `device_map` requires the `accelerate` library. Please install it using: `pip install accelerate`." | |
| ) | |
| if device_map is not None and not isinstance(device_map, str): | |
| raise ValueError("`device_map` must be a string.") | |
| if device_map is not None and device_map not in SUPPORTED_DEVICE_MAP: | |
| raise NotImplementedError( | |
| f"{device_map} not supported. Supported strategies are: {', '.join(SUPPORTED_DEVICE_MAP)}" | |
| ) | |
| if device_map is not None and device_map in SUPPORTED_DEVICE_MAP: | |
| if is_accelerate_version("<", "0.28.0"): | |
| raise NotImplementedError("Device placement requires `accelerate` version `0.28.0` or later.") | |
| if low_cpu_mem_usage is False and device_map is not None: | |
| raise ValueError( | |
| f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and" | |
| " dispatching. Please make sure to set `low_cpu_mem_usage=True`." | |
| ) | |
| transformer = NextDiT.from_pretrained(f"{model_path}", subfolder="transformer", torch_dtype=torch.float32, cache_dir=cache_dir) | |
| vae = AutoencoderKL.from_pretrained(f"{model_path}", subfolder="vae", cache_dir=cache_dir) | |
| text_encoder = T5EncoderModel.from_pretrained(f"{model_path}", subfolder="text_encoder", torch_dtype=torch.float16, cache_dir=cache_dir) | |
| tokenizer = T5Tokenizer.from_pretrained(model_path, subfolder="tokenizer", cache_dir=cache_dir) | |
| scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler", cache_dir=cache_dir) | |
| pipeline = cls( | |
| transformer=transformer, | |
| vae=vae, | |
| text_encoder=text_encoder, | |
| tokenizer=tokenizer, | |
| scheduler=scheduler, | |
| **kwargs | |
| ) | |
| return pipeline |
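# Note on dtypes in `from_pretrained` above: the `torch_dtype` kwarg is popped but not forwarded; the transformer is
# loaded in float32 and the T5 encoder in float16 as hard-coded above. A typical way to run in half precision is to
# cast after loading (sketch; the path is a placeholder):
#
#   >>> pipe = OneDiffusionPipeline.from_pretrained("path_to_one_diffuser_model")
#   >>> pipe = pipe.to("cuda", dtype=torch.bfloat16)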