# Source: Hugging Face Spaces - AIGC-Audio / AudioLCM (Running on Zero)
# File size: 24,949 Bytes
# Revision: 6efc863

"""SAMPLING ONLY."""

import torch
import numpy as np
from tqdm import tqdm
from functools import partial
from typing import List, Optional, Tuple, Union
from ldm.util import randn_tensor
from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
    extract_into_tensor


class LCMSampler(object):
    def __init__(self, model, **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.original_inference_steps = 100
        # setable values
        self.num_inference_steps = None
        self.timesteps = torch.from_numpy(np.arange(0, self.ddpm_num_timesteps)[::-1].copy().astype(np.int64))
        self.custom_timesteps = False
        self.timestep_scaling = 10.0
        self.prediction_type = 'epsilon'


    def register_buffer(self, name, attr):
        if type(attr) == torch.Tensor:
            if attr.device != torch.device("cuda"):
                attr = attr.to(torch.device("cuda"))
        setattr(self, name, attr)

    def make_schedule(self, ddim_discretize="uniform", verbose=True):
        # self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
        #                                           num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
        # alphas_cumprod = self.model.alphas_cumprod
        # assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        # beta_start = 0.00085
        # beta_end = 0.012
        # self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, self.ddpm_num_timesteps, dtype=torch.float32) ** 2
        # self.alphas = 1.0 - self.betas
        # self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # # calculations for diffusion q(x_t | x_{t-1}) and others
        # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        # self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        # self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        # self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        

        # # ddim sampling parameters
        # ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
        #                                                                            ddim_timesteps=self.ddim_timesteps,
        #                                                                            eta=ddim_eta,verbose=verbose)
        # self.register_buffer('ddim_sigmas', ddim_sigmas)
        # self.register_buffer('ddim_alphas', ddim_alphas)
        # self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        # self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        # sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
        #     (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
        #                 1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        # self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    def progress_bar(self, iterable=None, total=None):
        if not hasattr(self, "_progress_bar_config"):
            self._progress_bar_config = {}
        elif not isinstance(self._progress_bar_config, dict):
            raise ValueError(
                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
            )

        if iterable is not None:
            return tqdm(iterable, **self._progress_bar_config)
        elif total is not None:
            return tqdm(total=total, **self._progress_bar_config)
        else:
            raise ValueError("Either `total` or `iterable` has to be defined.")
        
    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """

        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298



        Args:

            timesteps (`torch.Tensor`):

                generate embedding vectors at these timesteps

            embedding_dim (`int`, *optional*, defaults to 512):

                dimension of the embeddings to generate

            dtype:

                data type of the generated embeddings



        Returns:

            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`

        """
        assert len(w.shape) == 1
        w = w * 1000.0

        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:  # zero pad
            emb = torch.nn.functional.pad(emb, (0, 1))
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

    @property
    def step_index(self):
        return self._step_index

    def set_timesteps(

        self,

        num_inference_steps: Optional[int] = None,

        device: Union[str, torch.device] = None,

        original_inference_steps: Optional[int] = None,

        timesteps: Optional[List[int]] = None,

        strength: int = 1.0,

    ):
        """

        Sets the discrete timesteps used for the diffusion chain (to be run before inference).



        Args:

            num_inference_steps (`int`, *optional*):

                The number of diffusion steps used when generating samples with a pre-trained model. If used,

                `timesteps` must be `None`.

            device (`str` or `torch.device`, *optional*):

                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.

            original_inference_steps (`int`, *optional*):

                The original number of inference steps, which will be used to generate a linearly-spaced timestep

                schedule (which is different from the standard `diffusers` implementation). We will then take

                `num_inference_steps` timesteps from this schedule, evenly spaced in terms of indices, and use that as

                our final timestep schedule. If not set, this will default to the `original_inference_steps` attribute.

            timesteps (`List[int]`, *optional*):

                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default

                timestep spacing strategy of equal spacing between timesteps on the training/distillation timestep

                schedule is used. If `timesteps` is passed, `num_inference_steps` must be `None`.

        """
        # 0. Check inputs
        if num_inference_steps is None and timesteps is None:
            raise ValueError("Must pass exactly one of `num_inference_steps` or `custom_timesteps`.")

        if num_inference_steps is not None and timesteps is not None:
            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")

        # 1. Calculate the LCM original training/distillation timestep schedule.
        original_steps = (
            original_inference_steps if original_inference_steps is not None else self.original_inference_steps
        )

        if original_steps > self.ddpm_num_timesteps:
            raise ValueError(
                f"`original_steps`: {original_steps} cannot be larger than `self.config.train_timesteps`:"
                f" {self.ddpm_num_timesteps} as the unet model trained with this scheduler can only handle"
                f" maximal {self.ddpm_num_timesteps} timesteps."
            )
        # import ipdb
        # ipdb.set_trace()
        # LCM Timesteps Setting
        # The skipping step parameter k from the paper.
        k = self.ddpm_num_timesteps // original_steps
        # LCM Training/Distillation Steps Schedule
        # Currently, only a linearly-spaced schedule is supported (same as in the LCM distillation scripts).
        lcm_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * k - 1

        # 2. Calculate the LCM inference timestep schedule.
        if timesteps is not None:
            # 2.1 Handle custom timestep schedules.
            train_timesteps = set(lcm_origin_timesteps)
            non_train_timesteps = []
            for i in range(1, len(timesteps)):
                if timesteps[i] >= timesteps[i - 1]:
                    raise ValueError("`custom_timesteps` must be in descending order.")

                if timesteps[i] not in train_timesteps:
                    non_train_timesteps.append(timesteps[i])

            if timesteps[0] >= self.ddpm_num_timesteps:
                raise ValueError(
                    f"`timesteps` must start before `self.config.train_timesteps`:"
                    f" {self.ddpm_num_timesteps}."
                )

            # Raise warning if timestep schedule does not start with self.config.num_train_timesteps - 1
            if strength == 1.0 and timesteps[0] != self.ddpm_num_timesteps - 1:
                logger.warning(
                    f"The first timestep on the custom timestep schedule is {timesteps[0]}, not"
                    f" `self.ddpm_num_timesteps - 1`: {self.ddpm_num_timesteps - 1}. You may get"
                    f" unexpected results when using this timestep schedule."
                )

            # Raise warning if custom timestep schedule contains timesteps not on original timestep schedule
            if non_train_timesteps:
                logger.warning(
                    f"The custom timestep schedule contains the following timesteps which are not on the original"
                    f" training/distillation timestep schedule: {non_train_timesteps}. You may get unexpected results"
                    f" when using this timestep schedule."
                )

            # Raise warning if custom timestep schedule is longer than original_steps
            if len(timesteps) > original_steps:
                logger.warning(
                    f"The number of timesteps in the custom timestep schedule is {len(timesteps)}, which exceeds the"
                    f" the length of the timestep schedule used for training: {original_steps}. You may get some"
                    f" unexpected results when using this timestep schedule."
                )

            timesteps = np.array(timesteps, dtype=np.int64)
            self.num_inference_steps = len(timesteps)
            self.custom_timesteps = True

            # Apply strength (e.g. for img2img pipelines) (see StableDiffusionImg2ImgPipeline.get_timesteps)
            init_timestep = min(int(self.num_inference_steps * strength), self.num_inference_steps)
            t_start = max(self.num_inference_steps - init_timestep, 0)
            timesteps = timesteps[t_start * self.order :]
            # TODO: also reset self.num_inference_steps?
        else:
            # 2.2 Create the "standard" LCM inference timestep schedule.
            if num_inference_steps > self.ddpm_num_timesteps:
                raise ValueError(
                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.ddpm_num_timesteps`:"
                    f" {self.ddpm_num_timesteps} as the unet model trained with this scheduler can only handle"
                    f" maximal {self.ddpm_num_timesteps} timesteps."
                )

            skipping_step = len(lcm_origin_timesteps) // num_inference_steps

            if skipping_step < 1:
                raise ValueError(
                    f"The combination of `original_steps x strength`: {original_steps} x {strength} is smaller than `num_inference_steps`: {num_inference_steps}. Make sure to either reduce `num_inference_steps` to a value smaller than {int(original_steps * strength)} or increase `strength` to a value higher than {float(num_inference_steps / original_steps)}."
                )

            self.num_inference_steps = num_inference_steps

            if num_inference_steps > original_steps:
                raise ValueError(
                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `original_inference_steps`:"
                    f" {original_steps} because the final timestep schedule will be a subset of the"
                    f" `original_inference_steps`-sized initial timestep schedule."
                )

            # LCM Inference Steps Schedule
            lcm_origin_timesteps = lcm_origin_timesteps[::-1].copy()
            # Select (approximately) evenly spaced indices from lcm_origin_timesteps.
            inference_indices = np.linspace(0, len(lcm_origin_timesteps), num=num_inference_steps, endpoint=False)
            inference_indices = np.floor(inference_indices).astype(np.int64)
            timesteps = lcm_origin_timesteps[inference_indices]

        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.long)

        self._step_index = None

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
    def retrieve_timesteps(

        self,

        num_inference_steps: Optional[int] = None,

        device: Optional[Union[str, torch.device]] = None,

        timesteps: Optional[List[int]] = None,

        **kwargs,

    ):
        """

        Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles

        custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.



        Args:

            scheduler (`SchedulerMixin`):

                The scheduler to get timesteps from.

            num_inference_steps (`int`):

                The number of diffusion steps used when generating samples with a pre-trained model. If used,

                `timesteps` must be `None`.

            device (`str` or `torch.device`, *optional*):

                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.

            timesteps (`List[int]`, *optional*):

                    Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default

                    timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`

                    must be `None`.



        Returns:

            `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the

            second element is the number of inference steps.

        """
        if timesteps is not None:
            self.set_timesteps(timesteps=timesteps, device=device, **kwargs)
            timesteps = self.timesteps
            num_inference_steps = len(timesteps)
        else:
            self.set_timesteps(num_inference_steps, device=device, **kwargs)
            timesteps = self.timesteps
        return timesteps, num_inference_steps   

    @torch.no_grad()
    def sample(self,

               S,

               batch_size,

               shape,

               conditioning=None,

               callback=None,

               normals_sequence=None,

               img_callback=None,

               verbose=True,

               x_T=None,

               guidance_scale=5.,

               original_inference_steps=50,

               timesteps=None,

               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...

               **kwargs

               ):
        if conditioning is not None:
            if isinstance(conditioning, dict):
                ctmp = conditioning[list(conditioning.keys())[0]]
                while isinstance(ctmp, list): ctmp = ctmp[0]
                cbs = ctmp.shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        self.make_schedule(verbose=verbose)
        self.num_inference_steps = S
        # sampling
        if len(shape)==3:
            C, H, W = shape
            size = (batch_size, C, H, W)
        else:
            C, T = shape
            size = (batch_size, C, T) 

        samples, intermediates = self.lcm_sampling(conditioning, size,
                                                    x_T=x_T,
                                                    guidance_scale=guidance_scale,
                                                    original_inference_steps=original_inference_steps,
                                                    timesteps=timesteps
                                                    )
        return samples, intermediates

    @torch.no_grad()
    def lcm_sampling(self, cond, shape,

                      x_T=None,

                      guidance_scale=1.,original_inference_steps=100,timesteps=None):
        device = self.model.betas.device
        timesteps, num_inference_steps = self.retrieve_timesteps(
            self.num_inference_steps, device, timesteps, original_inference_steps=original_inference_steps
        )
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        
        w = torch.tensor(guidance_scale - 1).repeat(b)
        w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=256).to(
            device=device, dtype=img.dtype
        )
        
        # import ipdb
        # ipdb.set_trace()
        # 8. LCM MultiStep Sampling Loop:
        num_warmup_steps = len(timesteps) - num_inference_steps
        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                img = img.to(cond.dtype)
                ts = torch.full((b,), t, device=device, dtype=torch.long)
                # model prediction (v-prediction, eps, x)
                model_pred = self.model.apply_model(img, ts, cond,self.model.unet, w_cond=w_embedding)

                # compute the previous noisy sample x_t -> x_t-1
                img, denoised = self.step(model_pred, t, img, return_dict=False)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps):
                    progress_bar.update()
        return denoised, img

    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
    def _init_step_index(self, timestep):
        if isinstance(timestep, torch.Tensor):
            timestep = timestep.to(self.timesteps.device)

        index_candidates = (self.timesteps == timestep).nonzero()

        # The sigma index that is taken for the **very** first `step`
        # is always the second index (or the last index if there is only 1)
        # This way we can ensure we don't accidentally skip a sigma in
        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
        if len(index_candidates) > 1:
            step_index = index_candidates[1]
        else:
            step_index = index_candidates[0]

        self._step_index = step_index.item()

    def get_scalings_for_boundary_condition_discrete(self, timestep):
        self.sigma_data = 0.5  # Default: 0.5
        scaled_timestep = timestep * self.timestep_scaling

        c_skip = self.sigma_data**2 / (scaled_timestep**2 + self.sigma_data**2)
        c_out = scaled_timestep / (scaled_timestep**2 + self.sigma_data**2) ** 0.5
        return c_skip, c_out

    @torch.no_grad()
    def step(

        self,

        model_output: torch.FloatTensor,

        timestep: int,

        sample: torch.FloatTensor,

        generator: Optional[torch.Generator] = None,

        return_dict: bool = True,

    ):
        """

        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion

        process from the learned model outputs (most often the predicted noise).



        Args:

            model_output (`torch.FloatTensor`):

                The direct output from learned diffusion model.

            timestep (`float`):

                The current discrete timestep in the diffusion chain.

            sample (`torch.FloatTensor`):

                A current instance of a sample created by the diffusion process.

            generator (`torch.Generator`, *optional*):

                A random number generator.

            return_dict (`bool`, *optional*, defaults to `True`):

                Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.

        Returns:

            [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:

                If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a

                tuple is returned where the first element is the sample tensor.

        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )
        
        if self.step_index is None:
            self._init_step_index(timestep)
        # 1. get previous step value
        prev_step_index = self.step_index + 1
        if prev_step_index < len(self.timesteps):
            prev_timestep = self.timesteps[prev_step_index]
        else:
            prev_timestep = timestep

        # 2. compute alphas, betas
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else torch.tensor(1.0)

        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        # 3. Get scalings for boundary conditions

        c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)

        # 4. Compute the predicted original sample x_0 based on the model parameterization
        if self.prediction_type == "epsilon":  # noise-prediction
            predicted_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
        elif self.prediction_type == "sample":  # x-prediction
            predicted_original_sample = model_output
        elif self.prediction_type == "v_prediction":  # v-prediction
            predicted_original_sample = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
        else:
            raise ValueError(
                f"prediction_type given as {self.prediction_type} must be one of `epsilon`, `sample` or"
                " `v_prediction` for `LCMScheduler`."
            )


        # 5. Denoise model output using boundary conditions
        denoised = c_out * predicted_original_sample + c_skip * sample

        # 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference
        # Noise is not used on the final timestep of the timestep schedule.
        # This also means that noise is not used for one-step sampling.
        if self.step_index != self.num_inference_steps - 1:
            noise = torch.randn(model_output.shape, device=model_output.device)
            prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
        else:
            prev_sample = denoised

        # upon completion increase step index by one
        self._step_index += 1

        if not return_dict:
            return (prev_sample, denoised)

        return prev_sample, denoised