# audioldm2_api / app.py
import gradio as gr
import torch
from diffusers import AudioLDM2Pipeline
# make Space compatible with CPU duplicates
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32
# load the diffusers pipeline
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
# pipe.unet = torch.compile(pipe.unet)
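# (torch.compile can speed up repeated UNet calls, at the cost of a longer
# first-call warm-up while the graph compiles; it is left disabled here.)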
# set the generator for reproducibility
generator = torch.Generator(device)
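# the generator is re-seeded with the user-supplied seed on every call below,
# so the same prompt and seed reproduce the same audio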
def text2audio(text, negative_prompt="Low quality.", duration=10,
               guidance_scale=3.5, random_seed=45, n_candidates=3):
    # defaults correspond to the original hard-coded test values; the Gradio
    # interface below only supplies the text prompt
    if not text:
        raise gr.Error("Please provide a text input.")
    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=20,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=n_candidates if n_candidates else 1,
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]
    print(waveforms)
    # AudioLDM2 generates 16 kHz audio; return (sample_rate, samples) tuples,
    # which the gr.Audio output components accept directly
    return [(16000, waveforms[0]), (16000, waveforms[1]), (16000, waveforms[2])]
gradio_interface = gr.Interface(
    fn=text2audio,
    inputs="text",
    outputs=["audio", "audio", "audio"],
)
gradio_interface.launch()