# NOTE: the original import order is preserved on purpose; some of these
# packages may be sensitive to the order in which they are first imported.
import gradio as gr
import json
import torch
import time
import random
import spaces
from tqdm import tqdm
from huggingface_hub import snapshot_download
from models import AudioDiffusion, DDPMScheduler
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from pydub import AudioSegment

# Upper bound used for randomly drawn seeds and for the seed slider.
max_64_bit_int = 2**63 - 1

# Automatic device detection
if torch.cuda.is_available():
    device_type = "cuda"
    device_selection = "cuda:0"
else:
    device_type = "cpu"
    device_selection = "cpu"


def _read_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


class Tango:
    """Wrapper bundling the VAE, STFT and diffusion model of a Tango checkpoint."""

    def __init__(self, name="declare-lab/tango2", device=device_selection):
        """Download the checkpoint *name* and load its three sub-models.

        Args:
            name: Repository id handed to ``snapshot_download``.
            device: Device the modules are moved to and the weights are
                mapped onto while loading.
        """
        path = snapshot_download(repo_id=name)

        vae_config = _read_json("{}/vae_config.json".format(path))
        stft_config = _read_json("{}/stft_config.json".format(path))
        main_config = _read_json("{}/main_config.json".format(path))

        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = AudioDiffusion(**main_config).to(device)

        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location=device)
        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location=device)
        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location=device)

        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)

        print("Successfully loaded checkpoint from:", name)

        # Inference only: switch every module to evaluation mode.
        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")

    def chunks(self, lst, n):
        """Yield successive n-sized chunks from a list."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
        """Generate audio for a single prompt string.

        Returns whatever ``self.vae.decode_to_waveform`` produces for the
        latents inferred from *prompt*; callers in this file index it once
        per requested sample.
        """
        with torch.no_grad():
            latents = self.model.inference(
                [prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress
            )
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)
        return wave

    def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
        """Generate audio for a list of prompt strings.

        The prompts are processed ``batch_size`` at a time. With
        ``samples == 1`` a flat list of waveforms is returned; otherwise the
        flat list is regrouped into sub-lists of ``samples`` items.
        """
        outputs = []
        for k in tqdm(range(0, len(prompts), batch_size)):
            batch = prompts[k:k + batch_size]
            with torch.no_grad():
                latents = self.model.inference(
                    batch, self.scheduler, steps, guidance, samples, disable_progress=disable_progress
                )
                mel = self.vae.decode_first_stage(latents)
                wave = self.vae.decode_to_waveform(mel)
            outputs.extend(wave)
        if samples == 1:
            return outputs
        return list(self.chunks(outputs, samples))


# Initialize TANGO: load on CPU first, then move each module to the detected device.
tango = Tango(device="cpu")
tango.vae.to(device_type)
tango.stft.to(device_type)
tango.model.to(device_type)


def update_seed(is_randomize_seed, seed):
    """Return a fresh random seed when randomization is on, else *seed* unchanged."""
    if is_randomize_seed:
        return random.randint(0, max_64_bit_int)
    return seed


def check(
    prompt,
    output_number,
    steps,
    guidance,
    is_randomize_seed,
    seed
):
    """Validate the user inputs before generation.

    Only *prompt* and *output_number* are inspected; the remaining
    parameters are accepted so the function can be wired to the same input
    list as ``text2audio``.

    Raises:
        gr.Error: If the prompt is empty or *output_number* is not 1, 2 or 3.
    """
    if prompt is None or prompt == "":
        raise gr.Error("Please provide a prompt input.")
    if output_number not in [1, 2, 3]:
        raise gr.Error("Please ask for 1, 2 or 3 output files.")


def update_output(output_format, output_number):
    """Prepare the output widgets for a new run.

    Returns four updates, in order: the three audio players (format set to
    *output_format*; players #2 and #3 shown only when that many outputs
    were requested) and the information label, which is hidden.
    """
    return [
        gr.update(format=output_format),
        gr.update(format=output_format, visible=(2 <= output_number)),
        gr.update(format=output_format, visible=(output_number == 3)),
        gr.update(visible=False)
    ]


@spaces.GPU(duration=420)
def text2audio(
    prompt,
    output_number,
    steps,
    guidance,
    is_randomize_seed,
    seed
):
    """Generate up to three audio outputs for *prompt* and report the elapsed time.

    *is_randomize_seed* is not read here (the seed is already resolved by
    ``update_seed`` in the click chain); it is kept so the signature matches
    the shared input list.

    Returns:
        A list of four items: the three ``gr.make_waveform`` results
        (``None`` for outputs that were not requested) and an update that
        shows the information label with the generation time.
    """
    start = time.time()

    if seed is None:
        seed = random.randint(0, max_64_bit_int)

    # Seed both generators so a given seed reproduces the same result.
    random.seed(seed)
    torch.manual_seed(seed)

    output_wave = tango.generate(prompt, steps, guidance, output_number)

    output_wave_1 = gr.make_waveform((16000, output_wave[0]))
    output_wave_2 = gr.make_waveform((16000, output_wave[1])) if (2 <= output_number) else None
    output_wave_3 = gr.make_waveform((16000, output_wave[2])) if (output_number == 3) else None

    end = time.time()
    minutes, secondes = divmod(int(end - start), 60)
    hours, minutes = divmod(minutes, 60)

    # Leading units are omitted while they are zero ("2 min, 5 sec." rather than "0 h, ...").
    elapsed = ""
    if hours != 0:
        elapsed += str(hours) + " h, "
    if hours != 0 or minutes != 0:
        elapsed += str(minutes) + " min, "
    elapsed += str(secondes) + " sec."

    return [
        output_wave_1,
        output_wave_2,
        output_wave_3,
        gr.update(
            visible=True,
            value="Start again to get a different result. The output have been generated in " + elapsed
        )
    ]


# Gradio interface
with gr.Blocks() as interface:
    # The Markdown text below is kept flush-left so the rendered output does
    # not depend on any indentation stripping.
    gr.Markdown("""

Text-to-Audio
Generates 10 seconds of sound effects from description, freely, without account, without watermark.



✨ Powered by Tango 2 AI.

""" + ("🏃‍♀️ Estimated time: few minutes." if torch.cuda.is_available() else "🐌 Slow process... ~5 min.") + """ Your computer must not enter into standby mode.
You can duplicate this space on a free account, it's designed to work on CPU, GPU and ZeroGPU.

⚖️ You can use, modify and share the generated sounds but not for commercial uses.
"""
    )
    input_text = gr.Textbox(label="Prompt", value="Snort of a horse", lines=2, autofocus=True)
    # NOTE(review): the accordion is assumed to hold every option from the
    # output format down to the seed — confirm against the intended layout.
    with gr.Accordion("Advanced options", open=False):
        output_format = gr.Radio(
            label="Output format",
            info="The file you can dowload",
            choices=["mp3", "wav"],
            value="wav",
        )
        output_number = gr.Slider(
            label="Number of generations",
            info="1, 2 or 3 output files",
            minimum=1,
            maximum=3,
            value=1,
            step=1,
            interactive=True,
        )
        denoising_steps = gr.Slider(
            label="Steps",
            info="lower=faster & variant, higher=audio quality & similar",
            minimum=10,
            maximum=200,
            value=10,
            step=1,
            interactive=True,
        )
        guidance_scale = gr.Slider(
            label="Guidance Scale",
            info="lower=audio quality, higher=follow the prompt",
            minimum=1,
            maximum=10,
            value=3,
            step=0.1,
            interactive=True,
        )
        randomize_seed = gr.Checkbox(
            label="\U0001F3B2 Randomize seed",
            value=True,
            info="If checked, result is always different",
        )
        seed = gr.Slider(minimum=0, maximum=max_64_bit_int, step=1, randomize=True, label="Seed")

    submit = gr.Button("🚀 Generate", variant="primary")

    output_audio_1 = gr.Audio(label="Generated Audio #1/3", format="wav", type="numpy", autoplay=True)
    output_audio_2 = gr.Audio(label="Generated Audio #2/3", format="wav", type="numpy")
    output_audio_3 = gr.Audio(label="Generated Audio #3/3", format="wav", type="numpy")
    information = gr.Label(label="Information")

    # Component lists shared by the click chain and the examples.
    generation_inputs = [
        input_text,
        output_number,
        denoising_steps,
        guidance_scale,
        randomize_seed,
        seed,
    ]
    generation_outputs = [
        output_audio_1,
        output_audio_2,
        output_audio_3,
        information,
    ]

    # Click pipeline: resolve the seed, validate, reset the outputs, then
    # generate. The last two steps only run if the previous step succeeded.
    submit.click(
        fn=update_seed,
        inputs=[randomize_seed, seed],
        outputs=[seed],
        queue=False,
        show_progress=False,
    ).then(
        fn=check,
        inputs=generation_inputs,
        outputs=[],
        queue=False,
        show_progress=False,
    ).success(
        fn=update_output,
        inputs=[output_format, output_number],
        outputs=generation_outputs,
        queue=False,
        show_progress=False,
    ).success(
        fn=text2audio,
        inputs=generation_inputs,
        outputs=generation_outputs,
        scroll_to_output=True,
    )

    gr.Examples(
        fn=text2audio,
        inputs=generation_inputs,
        outputs=generation_outputs,
        examples=[
            ["A hammer is hitting a wooden surface", 3, 100, 3, False, 123],
            ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3, False, 123],
            ["A man is speaking in a small room.", 2, 100, 3, False, 123],
            ["A female is speaking followed by footstep sound", 1, 100, 3, False, 123],
            ["Wooden table tapping sound followed by water pouring sound.", 3, 200, 3, False, 123],
        ],
        cache_examples="lazy",
    )

    gr.Markdown(
        """
## How to prompt your sound

You can use round brackets to increase the importance of a part:
```
Peaceful and (calming) ambient music with singing bowl and other instruments
```
You can use several levels of round brackets to even more increase the importance of a part:
```
(Peaceful) and ((calming)) ambient music with singing bowl and other instruments
```
You can use number instead of several round brackets:
```
(Peaceful:1.5) and ((calming)) ambient music with singing bowl and other instruments
```
You can do the same thing with square brackets to decrease the importance of a part:
```
(Peaceful:1.5) and ((calming)) ambient music with [singing:2] bowl and other instruments
```
"""
    )

# NOTE(review): what the positional argument of queue() means depends on the
# installed Gradio version — confirm it is the intended setting.
interface.queue(10).launch()