import gradio as gr
from diffusers import AudioLDMPipeline

# The recommended "fast" scheduler:
#from diffusers import DPMSolverMultistepScheduler
# The default AudioLDM scheduler:
#from diffusers import DDIMScheduler
#from diffusers import DDPMScheduler
#from diffusers import DEISMultistepScheduler
#from diffusers import DPMSolverSinglestepScheduler
#from diffusers import HeunDiscreteScheduler
from diffusers import KDPM2DiscreteScheduler
#from diffusers import KDPM2AncestralDiscreteScheduler
#from diffusers import LMSDiscreteScheduler
#from diffusers import PNDMScheduler
#from diffusers import EulerDiscreteScheduler
#from diffusers import EulerAncestralDiscreteScheduler
#from diffusers import UniPCMultistepScheduler
from transformers import AutoProcessor, ClapModel
import torch
# import scipy

device = "cpu"
repo_id = "cvssp/audioldm-s-full-v2"

# Load the AudioLDM text-to-audio pipeline in full precision for CPU inference
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float32)

# Swap in an alternative scheduler (uncomment one of these to experiment):
#pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)

# CLAP model and processor, used to rank candidate waveforms by text-audio similarity
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full", sampling_rate=16000)

generator = torch.Generator(device)


def texttoaudio(prompt, neg_prompt, seed, inf_steps, guidance_scale, n_candidates):
    if prompt is None:
        raise gr.Error("Please provide a text input.")

    # Generate one or more candidate waveforms from the text prompt
    waveforms = pipe(
        prompt,
        negative_prompt=neg_prompt,
        num_inference_steps=int(inf_steps),
        guidance_scale=guidance_scale,
        audio_length_in_s=5.0,
        generator=generator.manual_seed(int(seed)),
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
    )["audios"]

    # save the audio sample as a .wav file
    # scipy.io.wavfile.write("output.wav", rate=16000, data=audio)

    # If several candidates were generated, keep the one CLAP scores highest
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(prompt, waveforms)
    else:
        waveform = waveforms[0]
    return (16000, waveform)


def score_waveforms(text, waveforms):
    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True, sampling_rate=16000)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits_per_text = clap_model(**inputs).logits_per_text  # this is the audio-text similarity score
        probs = logits_per_text.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        most_probable = torch.argmax(probs)  # and now select the most likely audio waveform
    waveform = waveforms[most_probable]
    return waveform


iface = gr.Interface(
    fn=texttoaudio,
    title="AudioLDM Testing Playground",
    inputs=["text", "text", "number", "number", "number", "number"],
    outputs="audio",
)

iface.launch()