# app.py: AudioLDM text-to-audio Gradio demo, currently testing the KDPM2Discrete scheduler.
import gradio as gr
from diffusers import AudioLDMPipeline
# The recommended "fast" scheduler:
#from diffusers import DPMSolverMultistepScheduler
# The Default AudioLDM scheduler:
#from diffusers import DDIMScheduler
#from diffusers import DDPMScheduler
#from diffusers import DEISMultistepScheduler
#from diffusers import DPMSolverSinglestepScheduler
#from diffusers import HeunDiscreteScheduler
from diffusers import KDPM2DiscreteScheduler
#from diffusers import KDPM2AncestralDiscreteScheduler
#from diffusers import LMSDiscreteScheduler
#from diffusers import PNDMScheduler
#from diffusers import EulerDiscreteScheduler
#from diffusers import EulerAncestralDiscreteScheduler
#from diffusers import UniPCMultistepScheduler
from transformers import AutoProcessor, ClapModel
import torch
# import scipy
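# Load the AudioLDM pipeline on CPU in float32; the scheduler under test is swapped in below.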
device = "cpu"
repo_id = "cvssp/audioldm-s-full-v2"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float32)
#pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)
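# CLAP model and processor, used to rank candidate waveforms against the prompt (see score_waveforms).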
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full", sampling_rate=16000)
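# Reusable generator, reseeded on every request so a given seed reproduces the same output.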
generator = torch.Generator(device)

def texttoaudio(prompt, neg_prompt, seed, inf_steps, guidance_scale, n_candidates):
    if prompt is None:
        raise gr.Error("Please provide a text input.")

    # Generate one or more candidate waveforms from the text prompt.
    waveforms = pipe(
        prompt,
        negative_prompt=neg_prompt,
        num_inference_steps=int(inf_steps),
        guidance_scale=guidance_scale,
        audio_length_in_s=5.0,
        generator=generator.manual_seed(int(seed)),
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
    )["audios"]

    # save the audio sample as a .wav file
    # scipy.io.wavfile.write("output.wav", rate=16000, data=audio)

    # If several candidates were requested, keep the one CLAP ranks closest to the prompt.
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(prompt, waveforms)
    else:
        waveform = waveforms[0]
    return (16000, waveform)

def score_waveforms(text, waveforms):
    # Embed the prompt and every candidate waveform with CLAP, then pick the best-matching audio.
    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True, sampling_rate=16000)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits_per_text = clap_model(**inputs).logits_per_text  # audio-text similarity scores
        probs = logits_per_text.softmax(dim=-1)  # softmax over candidates gives per-waveform probabilities
        most_probable = torch.argmax(probs)  # index of the most likely audio waveform
    waveform = waveforms[most_probable]
    return waveform
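
# Gradio UI: inputs are prompt, negative prompt, seed, inference steps, guidance scale, and number of candidates.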
iface = gr.Interface(fn=texttoaudio, title="AudioLDM Testing Playground", inputs=["text", "text", "number", "number", "number", "number"], outputs="audio")
iface.launch()