Spaces:

fantaxy
/

tango2

Running

App Files Files Community

hungchiayu1 committed on May 6

Commit

df31906

•

1 Parent(s): 86a3494

Created tango2 pipeline

Browse files

Files changed (1) hide show

app.py +172 -4

app.py CHANGED Viewed

@@ -11,6 +11,165 @@ from pydub import AudioSegment
 from gradio import Markdown
 import spaces
 # Automatic device detection
 if torch.cuda.is_available():
     device_type = "cuda"
@@ -79,13 +238,22 @@ class Tango:
 # Initialize TANGO
 tango = Tango(device="cpu")
-tango.vae.to(device_type)
-tango.stft.to(device_type)
-tango.model.to(device_type)
 @spaces.GPU(duration=60)
 def gradio_generate(prompt, output_format, steps, guidance):
-    output_wave = tango.generate(prompt, steps, guidance)
     # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
     output_filename = "temp.wav"
     wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)

 from gradio import Markdown
 import spaces
+import torch
+from diffusers.models.autoencoder_kl import AutoencoderKL
+from diffusers.models.unet_2d_condition import UNet2DConditionModel
+from diffusers import DiffusionPipeline,AudioPipelineOutput
+from transformers import CLIPTextModel, T5EncoderModel, AutoModel, T5Tokenizer, T5TokenizerFast
+from typing import Union
+from diffusers.utils.torch_utils import randn_tensor
+from tqdm import tqdm
+class Tango2Pipeline(DiffusionPipeline):
+    """Text-to-audio diffusion pipeline built from TANGO 2 components.
+
+    Registers a VAE, a text encoder with its tokenizer, a conditional UNet
+    and a noise scheduler as pipeline modules, and exposes ``__call__`` to
+    turn a single text prompt into an ``AudioPipelineOutput``.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: T5EncoderModel,
+        tokenizer: Union[T5Tokenizer, T5TokenizerFast],
+        unet: UNet2DConditionModel,
+        # Quoted so that defining the class does not need the name to be
+        # resolvable at import time (no DDPMScheduler import is visible
+        # next to this class).
+        scheduler: "DDPMScheduler",
+    ):
+        """Register the given components as modules of this pipeline."""
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+
+    def _encode_prompt(self, prompt):
+        """Encode prompts without classifier-free guidance.
+
+        Args:
+            prompt: Text input(s) for the tokenizer; ``inference`` passes a
+                list of strings.
+
+        Returns:
+            A ``(encoder_hidden_states, boolean_encoder_mask)`` tuple: the
+            first element of the text encoder output, and a boolean mask
+            that is True where the tokenizer's attention mask equals 1.
+        """
+        device = self.text_encoder.device
+        batch = self.tokenizer(
+            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
+        )
+        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
+        encoder_hidden_states = self.text_encoder(
+            input_ids=input_ids, attention_mask=attention_mask
+        )[0]
+        boolean_encoder_mask = (attention_mask == 1).to(device)
+        return encoder_hidden_states, boolean_encoder_mask
+
+    def _encode_text_classifier_free(self, prompt, num_samples_per_prompt):
+        """Encode prompts together with empty-string unconditional prompts.
+
+        Args:
+            prompt: List of prompt strings.
+            num_samples_per_prompt: How many times each prompt's embedding
+                (and mask row) is repeated along the batch dimension.
+
+        Returns:
+            A ``(prompt_embeds, boolean_prompt_mask)`` tuple in which the
+            unconditional entries come first and the text-conditioned
+            entries second, concatenated along the batch dimension.
+        """
+        device = self.text_encoder.device
+        batch = self.tokenizer(
+            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
+        )
+        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
+        with torch.no_grad():
+            prompt_embeds = self.text_encoder(
+                input_ids=input_ids, attention_mask=attention_mask
+            )[0]
+        prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+        attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)
+        # get unconditional embeddings for classifier free guidance
+        uncond_tokens = [""] * len(prompt)
+        # Pad the empty prompts to the conditional sequence length so both
+        # halves can be concatenated below.
+        max_length = prompt_embeds.shape[1]
+        uncond_batch = self.tokenizer(
+            uncond_tokens, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt",
+        )
+        uncond_input_ids = uncond_batch.input_ids.to(device)
+        uncond_attention_mask = uncond_batch.attention_mask.to(device)
+        with torch.no_grad():
+            negative_prompt_embeds = self.text_encoder(
+                input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
+            )[0]
+        negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+        uncond_attention_mask = uncond_attention_mask.repeat_interleave(num_samples_per_prompt, 0)
+        # For classifier free guidance, we need to do two forward passes.
+        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
+        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+        prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
+        boolean_prompt_mask = (prompt_mask == 1).to(device)
+        return prompt_embeds, boolean_prompt_mask
+
+    def prepare_latents(self, batch_size, inference_scheduler, num_channels_latents, dtype, device):
+        """Draw the initial noise latents for the denoising loop.
+
+        The latent shape is fixed to ``(batch_size, num_channels_latents, 256, 16)``.
+        """
+        shape = (batch_size, num_channels_latents, 256, 16)
+        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * inference_scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def inference(self, prompt, inference_scheduler, num_steps=20, guidance_scale=3, num_samples_per_prompt=1,
+                  disable_progress=True):
+        """Run the denoising loop and return the final latents.
+
+        Args:
+            prompt: List of prompt strings.
+            inference_scheduler: Scheduler providing ``set_timesteps``,
+                ``scale_model_input``, ``step``, ``order`` and
+                ``init_noise_sigma``.
+            num_steps: Number of timesteps requested from the scheduler.
+            guidance_scale: Classifier-free guidance weight; guidance is
+                applied only when this is greater than 1.0.
+            num_samples_per_prompt: Number of samples generated per prompt.
+            disable_progress: When True the tqdm progress bar is disabled.
+
+        Returns:
+            The latents produced by the last scheduler step.
+        """
+        device = self.text_encoder.device
+        classifier_free_guidance = guidance_scale > 1.0
+        batch_size = len(prompt) * num_samples_per_prompt
+        if classifier_free_guidance:
+            prompt_embeds, boolean_prompt_mask = self._encode_text_classifier_free(prompt, num_samples_per_prompt)
+        else:
+            # The unguided encoder defined on this class is ``_encode_prompt``;
+            # there is no ``_encode_text`` method here.
+            prompt_embeds, boolean_prompt_mask = self._encode_prompt(prompt)
+            prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+            boolean_prompt_mask = boolean_prompt_mask.repeat_interleave(num_samples_per_prompt, 0)
+        inference_scheduler.set_timesteps(num_steps, device=device)
+        timesteps = inference_scheduler.timesteps
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(batch_size, inference_scheduler, num_channels_latents, prompt_embeds.dtype, device)
+        num_warmup_steps = len(timesteps) - num_steps * inference_scheduler.order
+        # Context manager so the progress bar is closed when the loop ends.
+        with tqdm(range(num_steps), disable=disable_progress) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if classifier_free_guidance else latents
+                latent_model_input = inference_scheduler.scale_model_input(latent_model_input, t)
+                noise_pred = self.unet(
+                    latent_model_input, t, encoder_hidden_states=prompt_embeds,
+                    encoder_attention_mask=boolean_prompt_mask
+                ).sample
+                # perform guidance
+                if classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = inference_scheduler.step(noise_pred, t, latents).prev_sample
+                # advance the bar on the last step and on every completed
+                # scheduler-order step after warmup
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % inference_scheduler.order == 0):
+                    progress_bar.update(1)
+        return latents
+
+    @torch.no_grad()
+    def __call__(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
+        """Generate audio for a single prompt string.
+
+        Args:
+            prompt: One prompt string; it is wrapped in a list for ``inference``.
+            steps: Number of denoising steps.
+            guidance: Classifier-free guidance scale.
+            samples: Number of samples to generate for the prompt.
+            disable_progress: When True the progress bar is disabled.
+
+        Returns:
+            An ``AudioPipelineOutput`` whose ``audios`` field holds the
+            result of ``vae.decode_to_waveform``.
+        """
+        # Gradients are already disabled by the decorator, so no nested
+        # ``torch.no_grad()`` block is needed here.
+        latents = self.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
+        # NOTE(review): assumes the registered VAE exposes
+        # ``decode_first_stage`` / ``decode_to_waveform`` - confirm against
+        # the VAE passed in by the caller.
+        mel = self.vae.decode_first_stage(latents)
+        wave = self.vae.decode_to_waveform(mel)
+        return AudioPipelineOutput(audios=wave)
 # Automatic device detection
 if torch.cuda.is_available():
     device_type = "cuda"
 # Initialize TANGO
 tango = Tango(device="cpu")
+# Wrap the components already loaded by ``tango`` in a Tango2Pipeline.
+pipe = Tango2Pipeline(vae=tango.vae,
+                      text_encoder=tango.model.text_encoder,
+                      tokenizer=tango.model.tokenizer,
+                      unet=tango.model.unet,
+                      scheduler=tango.scheduler
+                      )
+# ``device_type`` is the name assigned by the device detection above; no
+# variable called ``device`` is defined in the visible part of this file.
+pipe.to(device_type)
+#tango.vae.to(device_type)
+#tango.stft.to(device_type)
+#tango.model.to(device_type)
 @spaces.GPU(duration=60)
 def gradio_generate(prompt, output_format, steps, guidance):
+    output_wave = pipe(prompt,steps,guidance) ## Using pipeliine automatically uses flash attention for torch2.0 above
+    #output_wave = tango.generate(prompt, steps, guidance)
     # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
     output_filename = "temp.wav"
     wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)