Spaces:

Bai-YT
/

ConsistencyTTA

Sleeping

File size: 8,430 Bytes

66982e9

import torch
from torch import nn, Tensor
from transformers import AutoTokenizer, T5EncoderModel

from diffusers.utils.torch_utils import randn_tensor
from diffusers import UNet2DConditionGuidedModel, HeunDiscreteScheduler
from audioldm.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from audioldm.utils import default_audioldm_config


class ConsistencyTTA(nn.Module):

    def __init__(self):
        super().__init__()

        # Initialize the consistency U-Net
        unet_model_config_path='tango_diffusion_light.json'
        unet_config = UNet2DConditionGuidedModel.load_config(unet_model_config_path)
        self.unet = UNet2DConditionGuidedModel.from_config(unet_config, subfolder="unet")

        unet_weight_path = "consistencytta_clapft_ckpt/unet_state_dict.pt"
        unet_weight_sd = torch.load(unet_weight_path, map_location='cpu')
        self.unet.load_state_dict(unet_weight_sd)

        # Initialize FLAN-T5 tokenizer and text encoder
        text_encoder_name = 'google/flan-t5-large'
        self.tokenizer = AutoTokenizer.from_pretrained(text_encoder_name)
        self.text_encoder = T5EncoderModel.from_pretrained(text_encoder_name)
        self.text_encoder.eval(); self.text_encoder.requires_grad_(False)

        # Initialize the VAE
        raw_vae_path = "consistencytta_clapft_ckpt/vae_state_dict.pt"
        raw_vae_sd = torch.load(raw_vae_path, map_location="cpu")
        vae_state_dict, scale_factor = raw_vae_sd["state_dict"], raw_vae_sd["scale_factor"]

        config = default_audioldm_config('audioldm-s-full')
        vae_config = config["model"]["params"]["first_stage_config"]["params"]
        vae_config["scale_factor"] = scale_factor

        self.vae = AutoencoderKL(**vae_config)
        self.vae.load_state_dict(vae_state_dict)
        self.vae.eval(); self.vae.requires_grad_(False)

        # Initialize the STFT
        self.fn_STFT = TacotronSTFT(
            config["preprocessing"]["stft"]["filter_length"],  # default 1024
            config["preprocessing"]["stft"]["hop_length"],  # default 160
            config["preprocessing"]["stft"]["win_length"],  # default 1024
            config["preprocessing"]["mel"]["n_mel_channels"],  # default 64
            config["preprocessing"]["audio"]["sampling_rate"],  # default 16000
            config["preprocessing"]["mel"]["mel_fmin"],  # default 0
            config["preprocessing"]["mel"]["mel_fmax"],  # default 8000
        )
        self.fn_STFT.eval(); self.fn_STFT.requires_grad_(False)

        self.scheduler = HeunDiscreteScheduler.from_pretrained(
            pretrained_model_name_or_path='stabilityai/stable-diffusion-2-1', subfolder="scheduler"
        )


    def train(self, mode: bool = True):
        self.unet.train(mode)
        for model in [self.text_encoder, self.vae, self.fn_STFT]:
            model.eval()
        return self


    def eval(self):
        return self.train(mode=False)


    def check_eval_mode(self):
        for model, name in zip(
            [self.text_encoder, self.vae, self.fn_STFT, self.unet],
            ['text_encoder', 'vae', 'fn_STFT', 'unet']
        ):
            assert model.training == False, f"The {name} is not in eval mode."
            for param in model.parameters():
                assert param.requires_grad == False, f"The {name} is not frozen."


    @torch.no_grad()
    def encode_text(self, prompt, max_length=None, padding=True):
        device = self.text_encoder.device
        if max_length is None:
            max_length = self.tokenizer.model_max_length

        batch = self.tokenizer(
            prompt, max_length=max_length, padding=padding,
            truncation=True, return_tensors="pt"
        )
        input_ids = batch.input_ids.to(device)
        attention_mask = batch.attention_mask.to(device)

        prompt_embeds = self.text_encoder(
            input_ids=input_ids, attention_mask=attention_mask
        )[0]
        bool_prompt_mask = (attention_mask == 1).to(device)  # Convert to boolean
        return prompt_embeds, bool_prompt_mask


    @torch.no_grad()
    def encode_text_classifier_free(self, prompt: str, num_samples_per_prompt: int):
        # get conditional embeddings
        cond_prompt_embeds, cond_prompt_mask = self.encode_text(prompt)
        cond_prompt_embeds = cond_prompt_embeds.repeat_interleave(
            num_samples_per_prompt, 0
        )
        cond_prompt_mask = cond_prompt_mask.repeat_interleave(
            num_samples_per_prompt, 0
        )

        # get unconditional embeddings for classifier free guidance
        uncond_tokens = [""] * len(prompt)
        negative_prompt_embeds, uncond_prompt_mask = self.encode_text(
            uncond_tokens, max_length=cond_prompt_embeds.shape[1], padding="max_length"
        )
        negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(
            num_samples_per_prompt, 0
        )
        uncond_prompt_mask = uncond_prompt_mask.repeat_interleave(
            num_samples_per_prompt, 0
        )

        """ For classifier-free guidance, we need to do two forward passes.
            We concatenate the unconditional and text embeddings into a single batch 
        """
        prompt_embeds = torch.cat([negative_prompt_embeds, cond_prompt_embeds])
        prompt_mask = torch.cat([uncond_prompt_mask, cond_prompt_mask])

        return prompt_embeds, prompt_mask, cond_prompt_embeds, cond_prompt_mask


    def forward(
        self, prompt: str, cfg_scale_input: float = 3., cfg_scale_post: float = 1.,
        num_steps: int = 1, num_samples: int = 1, sr: int = 16000
    ):
        self.check_eval_mode()
        device = self.text_encoder.device
        use_cf_guidance = cfg_scale_post > 1.

        # Get prompt embeddings
        prompt_embeds_cf, prompt_mask_cf, prompt_embeds, prompt_mask = \
            self.encode_text_classifier_free(prompt, num_samples)
        encoder_states, encoder_att_mask = \
            (prompt_embeds_cf, prompt_mask_cf) if use_cf_guidance \
                else (prompt_embeds, prompt_mask)

        # Prepare noise
        num_channels_latents = self.unet.config.in_channels
        latent_shape = (len(prompt) * num_samples, num_channels_latents, 256, 16)
        noise = randn_tensor(
            latent_shape, generator=None, device=device, dtype=prompt_embeds.dtype
        )

        # Query the inference scheduler to obtain the time steps.
        # The time steps spread between 0 and training time steps
        self.scheduler.set_timesteps(18, device=device)  # Set this to training steps first
        z_N = noise * self.scheduler.init_noise_sigma

        def calc_zhat_0(z_n: Tensor, t: int):
            """ Query the consistency model to get zhat_0, which is the denoised embedding.
            Args:
                z_n (Tensor):   The noisy embedding.
                t (int):        The time step.
            Returns:
                Tensor:         The denoised embedding.
            """
            # expand the latents if we are doing classifier free guidance
            z_n_input = torch.cat([z_n] * 2) if use_cf_guidance else z_n
            # Scale model input as required for some schedules.
            z_n_input = self.scheduler.scale_model_input(z_n_input, t)

            # Get zhat_0 from the model
            zhat_0 = self.unet(
                z_n_input, t, guidance=cfg_scale_input,
                encoder_hidden_states=encoder_states, encoder_attention_mask=encoder_att_mask
            ).sample

            # Perform external classifier-free guidance
            if use_cf_guidance:
                zhat_0_uncond, zhat_0_cond = zhat_0.chunk(2)
                zhat_0 = (1 - cfg_scale_post) * zhat_0_uncond + cfg_scale_post * zhat_0_cond

            return zhat_0

        # Query the consistency model
        zhat_0 = calc_zhat_0(z_N, self.scheduler.timesteps[0])

        # Iteratively query the consistency model if requested
        self.scheduler.set_timesteps(num_steps, device=device)

        for t in self.scheduler.timesteps[1::2]:  # 2 is the order of the scheduler
            zhat_n = self.scheduler.add_noise(zhat_0, torch.randn_like(zhat_0), t)
            # Calculate new zhat_0
            zhat_0 = calc_zhat_0(zhat_n, t)

        mel = self.vae.decode_first_stage(zhat_0.float())
        return self.vae.decode_to_waveform(mel)[:, :int(sr * 9.5)]  # Truncate to 9.6 seconds