# Music3 / app.py
import torch
import gradio as gr
from diffusers import AudioLDMPipeline
from transformers import AutoProcessor, ClapModel
# make Space compatible with CPU duplicates
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32
# load the diffusers pipeline
repo_id = "cvssp/audioldm-m-full"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
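# compile the UNet with torch.compile (PyTorch 2.x) to speed up repeated denoising steps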
pipe.unet = torch.compile(pipe.unet)
# CLAP model (only required for automatic scoring)
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
generator = torch.Generator(device)
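# Re-rank candidate waveforms with CLAP: embed the prompt and each waveform,
# then return the waveform whose audio embedding best matches the text.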
def score_waveforms(text, waveforms):
    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits_per_text = clap_model(**inputs).logits_per_text  # audio-text similarity scores
        probs = logits_per_text.softmax(dim=-1)  # softmax over candidates to get probabilities
        most_probable = torch.argmax(probs)  # index of the most likely waveform
    waveform = waveforms[most_probable]
    return waveform
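# Generate one or more candidate waveforms from the prompt; when several are
# requested, use CLAP scoring to pick the best match.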
def text_to_music(text_input, negative_prompt, seed, duration, guidance_scale, n_candidates):
    waveforms = pipe(
        text_input,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=100,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=n_candidates if n_candidates else 1,
        generator=generator.manual_seed(int(seed)),
    )["audios"]
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(text_input, waveforms)
    else:
        waveform = waveforms[0]
    # the pipeline returns NumPy waveforms sampled at 16 kHz, so return (rate, data) for gr.Audio
    return 16000, waveform
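# Gradio UI: prompt, negative prompt, seed, duration, guidance scale, and
# number of candidate waveforms in; the selected audio clip out.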
iface = gr.Interface(
    fn=text_to_music,
    inputs=[
        gr.Textbox(label="Input text", value="A hammer is hitting a wooden surface"),
        gr.Textbox(label="Negative prompt", value="low quality, average quality"),
        gr.Number(label="Seed", value=45),
        gr.Slider(label="Duration (seconds)", minimum=2.5, maximum=10.0, value=5.0, step=0.1),
        gr.Slider(label="Guidance scale", minimum=0.0, maximum=4.0, value=2.5, step=0.1),
        gr.Slider(label="Number of waveforms to generate", minimum=1, maximum=3, value=3, step=1),
    ],
    outputs=gr.Audio(label="Generated Audio", type="numpy"),
    live=True,
    title="Text to Music",
    description="Convert text into music using a pre-trained AudioLDM model.",
    theme="default",
)
iface.launch()