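# Gradio demo: text-to-audio generation with AudioLDM.
# When several candidate waveforms are requested, a CLAP model re-ranks them
# against the prompt and the best-matching one is returned.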
import gradio as gr
import torch

from diffusers import AudioLDMPipeline, DPMSolverMultistepScheduler
from transformers import AutoProcessor, ClapModel

# import scipy

device = "cpu"  # switch to "cuda" if a GPU is available

# load the AudioLDM text-to-audio pipeline
repo_id = "cvssp/audioldm-s-full-v2"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float32)
# pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)

# CLAP model and processor, used to score generated audio against the text prompt
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")

generator = torch.Generator(device)  # seeded per request inside texttoaudio

def texttoaudio(prompt, neg_prompt, seed, inf_steps, guidance_scale, n_candidates):
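    """Generate a 5-second audio clip from a text prompt with AudioLDM."""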
    # an empty textbox arrives as "" rather than None, so reject both
    if not prompt:
        raise gr.Error("Please provide a text input.")
        
    waveforms = pipe(
        prompt,
        negative_prompt=neg_prompt,
        num_inference_steps=int(inf_steps),
        guidance_scale=guidance_scale,
        audio_length_in_s=5.0,
        generator=generator.manual_seed(int(seed)),
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
    )["audios"]

    # save the audio sample as a .wav file
    # scipy.io.wavfile.write("output.wav", rate=16000, data=audio)

    # if several candidates were generated, keep the one CLAP ranks closest to the prompt
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(prompt, waveforms)
    else:
        waveform = waveforms[0]

    return (16000, waveform)  # Gradio audio output: (sample rate in Hz, waveform array)

def score_waveforms(text, waveforms):
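    """Return the waveform that CLAP ranks as the best match for the text."""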
    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits_per_text = clap_model(**inputs).logits_per_text  # this is the audio-text similarity score
        probs = logits_per_text.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        most_probable = torch.argmax(probs)  # and now select the most likely audio waveform
    waveform = waveforms[most_probable]
    return waveform

# inputs map to: prompt, negative prompt, seed, inference steps, guidance scale, number of candidates
iface = gr.Interface(fn=texttoaudio, title="AudioLDM Testing Playground", inputs=["text", "text", "number", "number", "number", "number"], outputs="audio")

iface.launch()