import json

import gradio as gr
import torch
import wavio
from tqdm import tqdm
from huggingface_hub import snapshot_download
from models import AudioDiffusion, DDPMScheduler
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from pydub import AudioSegment
from gradio import Markdown
import spaces

# Automatic device detection: prefer the first CUDA GPU when available.
if torch.cuda.is_available():
    device_type = "cuda"
    device_selection = "cuda:0"
else:
    device_type = "cpu"
    device_selection = "cpu"


class Tango:
    """Wrapper around the TANGO text-to-audio latent-diffusion pipeline.

    Downloads a checkpoint from the Hugging Face Hub and assembles the three
    sub-models it contains (VAE, STFT front-end, diffusion model), then
    exposes single-prompt and batched generation helpers.
    """

    def __init__(self, name="declare-lab/tango2-full", device=device_selection):
        """Download *name* from the Hub and load all sub-model weights onto *device*."""
        path = snapshot_download(repo_id=name)

        # Use context managers so the config file handles are closed promptly
        # (the original `json.load(open(...))` left them to the GC).
        with open(f"{path}/vae_config.json") as f:
            vae_config = json.load(f)
        with open(f"{path}/stft_config.json") as f:
            stft_config = json.load(f)
        with open(f"{path}/main_config.json") as f:
            main_config = json.load(f)

        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = AudioDiffusion(**main_config).to(device)

        vae_weights = torch.load(f"{path}/pytorch_model_vae.bin", map_location=device)
        stft_weights = torch.load(f"{path}/pytorch_model_stft.bin", map_location=device)
        main_weights = torch.load(f"{path}/pytorch_model_main.bin", map_location=device)
        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)
        print("Successfully loaded checkpoint from:", name)

        # Inference only: freeze dropout/batch-norm behavior.
        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(
            main_config["scheduler_name"], subfolder="scheduler"
        )

    def chunks(self, lst, n):
        """Yield successive n-sized chunks from *lst*."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def generate(self, prompt, steps=100, guidance=3, samples=3, disable_progress=True):
        """Generate audio for a single prompt string.

        Returns the decoded waveform(s) produced by the VAE for *samples*
        diffusion samples of *prompt*.
        """
        with torch.no_grad():
            latents = self.model.inference(
                [prompt], self.scheduler, steps, guidance, samples,
                disable_progress=disable_progress,
            )
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)
        return wave

    def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1,
                           batch_size=8, disable_progress=True):
        """Generate audio for a list of prompt strings, *batch_size* prompts at a time.

        Returns a flat list of waveforms when samples == 1, otherwise a list
        of `samples`-sized groups, one group per prompt.
        """
        outputs = []
        for k in tqdm(range(0, len(prompts), batch_size)):
            batch = prompts[k:k + batch_size]
            with torch.no_grad():
                latents = self.model.inference(
                    batch, self.scheduler, steps, guidance, samples,
                    disable_progress=disable_progress,
                )
                mel = self.vae.decode_first_stage(latents)
                wave = self.vae.decode_to_waveform(mel)
                outputs += list(wave)
        if samples == 1:
            return outputs
        # Group the flat output list into one chunk of `samples` waves per prompt.
        return list(self.chunks(outputs, samples))


# Initialize TANGO on CPU, then move the sub-models to the detected device.
tango = Tango(device="cpu")
tango.vae.to(device_type)
tango.stft.to(device_type)
tango.model.to(device_type)


@spaces.GPU(duration=120)
def gradio_generate(prompt, output_format, output_number, steps, guidance):
    """Gradio callback: synthesize up to three audio files for *prompt*.

    Writes tmp1..tmpN (wav, optionally converted to mp3) and returns a list
    of exactly three filenames, padded with None when output_number < 3.
    """
    output_wave = tango.generate(prompt, steps, guidance, output_number)

    filenames = [None, None, None]
    # One file per requested sample; identical handling for each slot
    # (replaces three copy-pasted blocks in the original).
    for i in range(output_number):
        wav_name = f"tmp{i + 1}.wav"
        wavio.write(wav_name, output_wave[i], rate=16000, sampwidth=2)
        if output_format == "mp3":
            mp3_name = f"tmp{i + 1}.mp3"
            AudioSegment.from_wav(wav_name).export(mp3_name, format="mp3")
            filenames[i] = mp3_name
        else:
            filenames[i] = wav_name
    return filenames


# description_text = """
# For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings.
# Generate audio using TANGO by providing a text prompt.
#
# Limitations: TANGO is trained on the small AudioCaps dataset so it may not generate good audio \
# samples related to concepts that it has not seen in training (e.g. singing). For the same reason, TANGO \
# is not always able to finely control its generations over textual control prompts. For example, \
# the generations from TANGO for prompts Chopping tomatoes on a wooden table and Chopping potatoes \
# on a metal table are very similar. \
#
# We are currently training another version of TANGO on larger datasets to enhance its generalization, \
# compositional and controllable generation ability.
#
# We recommend using a guidance scale of 3. The default number of steps is set to 100. More steps generally lead to better quality of generated audios but will take longer.
# """
description_text = """
For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings.
Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on Audio-alpaca.
This is the demo for Tango2 for text-to-audio generation. Read our paper.
"""