import gradio as gr
import torch
from diffusers import AudioLDM2Pipeline

# make Space compatible with CPU duplicates
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# load the diffusers pipeline
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
# pipe.unet = torch.compile(pipe.unet)

# set the generator for reproducibility
generator = torch.Generator(device)

# Sample rate handed to gr.make_waveform together with each generated waveform.
SAMPLE_RATE = 16000
# Number of audio components declared in the interface below; text2audio
# always returns exactly this many values so the two stay in sync.
NUM_OUTPUTS = 3


def text2audio(
    text,
    negative_prompt="Low quality.",
    duration=10,
    guidance_scale=3.5,
    random_seed=45,
    n_candidates=3,
):
    """Generate audio for a text prompt and render each result as a waveform.

    Only ``text`` is supplied by the Gradio interface defined in this file;
    the remaining parameters fall back to their defaults unless a caller
    passes them explicitly.

    Args:
        text: Prompt describing the audio to generate.
        negative_prompt: Forwarded to the pipeline as ``negative_prompt``.
        duration: Forwarded to the pipeline as ``audio_length_in_s``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.
        random_seed: Seed applied to the shared generator before sampling,
            so the same inputs reproduce the same output.
        n_candidates: Number of waveforms requested from the pipeline; a
            falsy value (``None`` or ``0``) requests a single waveform.

    Returns:
        A list of exactly ``NUM_OUTPUTS`` items. Each item is the result of
        ``gr.make_waveform`` for one generated waveform, taken in the order
        the pipeline returned them; missing slots are filled with ``None``
        when fewer than ``NUM_OUTPUTS`` waveforms were generated.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # A Gradio textbox sends "" rather than None when left empty, so reject
    # every falsy value as well as whitespace-only strings.
    if not text or (isinstance(text, str) and not text.strip()):
        raise gr.Error("Please provide a text input.")

    num_waveforms = int(n_candidates) if n_candidates else 1

    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=20,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=num_waveforms,
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]

    # NOTE(review): gr.make_waveform appears to produce a video file while the
    # interface declares "audio" outputs -- confirm this renders as intended
    # on the Gradio version in use.
    rendered = [
        gr.make_waveform((SAMPLE_RATE, waveform))
        for waveform in waveforms[:NUM_OUTPUTS]
    ]
    # Pad so the number of returned values always matches the interface's
    # output components, even when fewer candidates were requested.
    rendered.extend([None] * (NUM_OUTPUTS - len(rendered)))
    return rendered


gradio_interface = gr.Interface(
    fn=text2audio,
    inputs="text",
    outputs=["audio"] * NUM_OUTPUTS,
)
gradio_interface.launch()