import gradio as gr
import json
import torch
import wavio
from tqdm import tqdm
from huggingface_hub import snapshot_download
from models import AudioDiffusion, DDPMScheduler
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from gradio import Markdown
import spaces

# Automatic device detection
if torch.cuda.is_available():
    device_type = "cuda"
    device_selection = "cuda:0"
else:
    device_type = "cpu"
    device_selection = "cpu"


class Tango:
    def __init__(self, name="declare-lab/tango", device=device_selection):
        path = snapshot_download(repo_id=name)

        # Component configs shipped with the checkpoint
        vae_config = json.load(open("{}/vae_config.json".format(path)))
        stft_config = json.load(open("{}/stft_config.json".format(path)))
        main_config = json.load(open("{}/main_config.json".format(path)))

        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = AudioDiffusion(**main_config).to(device)

        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location=device)
        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location=device)
        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location=device)

        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)
        print("Successfully loaded checkpoint from:", name)

        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")

    def chunks(self, lst, n):
        """Yield successive n-sized chunks from a list."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
        """Generate audio for a single prompt string."""
        with torch.no_grad():
            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)
        return wave[0]

    def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
        """Generate audio for a list of prompt strings."""
        outputs = []
        for k in tqdm(range(0, len(prompts), batch_size)):
            batch = prompts[k: k + batch_size]
            with torch.no_grad():
                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
                mel = self.vae.decode_first_stage(latents)
                wave = self.vae.decode_to_waveform(mel)
                outputs += [item for item in wave]
        if samples == 1:
            return outputs
        else:
            return list(self.chunks(outputs, samples))


# Initialize TANGO on CPU, then move each submodule to the detected device
tango = Tango(device="cpu")
tango.vae.to(device_type)
tango.stft.to(device_type)
tango.model.to(device_type)


@spaces.GPU(duration=60)
def gradio_generate(prompt, steps, guidance):
    output_wave = tango.generate(prompt, steps, guidance)
    # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
    output_filename = "temp.wav"
    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
    return output_filename


description_text = """
For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings.
Generate audio using TANGO by providing a text prompt.
Because TANGO uses an instruction-tuned LLM, it can process complex sound descriptions, so more detailed instructions tend to improve generation quality.
For example, compare "A boat is moving on the sea" with "The sound of the water lapping against the hull of the boat or splashing as you move through the waves". The latter was obtained by prompting ChatGPT to explain the sound a boat makes while moving on the sea.
With this ChatGPT-generated description of the sound, TANGO produces superior results.
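For programmatic use outside this demo, here is a minimal sketch of the same idea (it assumes the `Tango` wrapper from this Space's `app.py` is importable; adjust the import to your setup):

```python
import wavio
from app import tango  # assumption: the Tango instance defined in this Space's app.py

# A terse prompt vs. a ChatGPT-expanded description of the same scene
simple_prompt = "A boat is moving on the sea"
detailed_prompt = (
    "The sound of the water lapping against the hull of the boat "
    "or splashing as you move through the waves"
)

# More descriptive prompts tend to produce better generations
wave = tango.generate(detailed_prompt, steps=100, guidance=3)
wavio.write("detailed_boat.wav", wave, rate=16000, sampwidth=2)  # TANGO outputs 16 kHz audio
```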