xujunhao commited on
Commit
7d9b08c
β€’
1 Parent(s): 20780d8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
from diffusers import AudioLDMPipeline

from transformers import AutoProcessor, ClapModel


# Pick the best available accelerator instead of hard-coding "mps":
# the original crashed on any machine without Apple-silicon MPS support.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
torch_dtype = torch.float32

# Text-to-audio latent diffusion pipeline (AudioLDM, medium "full" checkpoint).
repo_id = "cvssp/audioldm-m-full"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
pipe.unet = torch.compile(pipe.unet)  # compile the UNet once; repeated denoising calls get faster

# CLAP model + processor, used to re-rank candidate waveforms by
# text/audio similarity when more than one candidate is generated.
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")

# Dedicated RNG so generation is reproducible for a given user-supplied seed.
generator = torch.Generator(device)
22
def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
    """Generate audio for *text* and return a gradio waveform video.

    Args:
        text: Prompt describing the desired sound; must be non-empty.
        negative_prompt: Qualities to steer the generation away from.
        duration: Audio length in seconds.
        guidance_scale: Classifier-free guidance strength.
        random_seed: Seed for the module-level generator (reproducibility).
        n_candidates: Number of waveforms to generate; the best CLAP match is kept.

    Raises:
        gr.Error: If the prompt is empty.
    """
    # A blank gradio textbox submits "" (not None), so test for falsy input;
    # the original `is None` check never fired for an empty prompt.
    if not text:
        raise gr.Error("θ―·ζδΎ›ζ–‡ζœ¬θΎ“ε…₯")

    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=n_candidates if n_candidates else 1,
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]

    # With several candidates, keep the one CLAP ranks as the best text match.
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(text, waveforms)
    else:
        waveform = waveforms[0]

    # AudioLDM produces 16 kHz audio.
    return gr.make_waveform((16000, waveform))
41
+
42
+
43
def score_waveforms(text, waveforms):
    """Rank candidate waveforms with CLAP and return the best match for *text*."""
    batch = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
    # Move every input tensor onto the same device as the CLAP model.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        similarity = clap_model(**batch).logits_per_text
    # Softmax over candidates, then pick the highest-probability waveform.
    best_index = similarity.softmax(dim=-1).argmax()
    return waveforms[best_index]
52
+
53
+
54
iface = gr.Blocks()

with iface:
    with gr.Group():
        with gr.Box():
            # Main prompt input.
            textbox = gr.Textbox(
                max_lines=1,
                label="要求",
                info="要求",
                elem_id="prompt-in",
            )
            # Negative prompt gets its own element id; the original reused
            # "prompt-in", producing duplicate DOM ids on the page.
            negative_textbox = gr.Textbox(
                max_lines=1,
                label="ζ›΄θ―¦η»†ηš„θ¦ζ±‚",
                info="ζ›΄θ―¦η»†ηš„θ¦ζ±‚",
                elem_id="negative-prompt-in",
            )

        # Advanced generation options, collapsed by default.
        with gr.Accordion("ε±•εΌ€ζ›΄ε€šι€‰ι‘Ή", open=False):
            seed = gr.Number(
                value=45,
                label="种子",
                info="δΈεŒη§ε­ζœ‰δΈεŒη»“ζžœ,η›ΈεŒη§ε­ζœ‰η›ΈεŒη»“ζžœ",
            )
            duration = gr.Slider(2.5, 10, value=5, step=2.5, label="ζŒη»­ζ—Άι—΄(η§’)")
            guidance_scale = gr.Slider(
                0,
                4,
                value=2.5,
                step=0.5,
                label="质量",
                info="ε€§ζœ‰ζ›΄ε₯½ηš„θ΄¨ι‡ε’ŒδΈŽζ–‡ζœ¬ηš„η›Έε…³ζ€§οΌ›ε°ζœ‰ζ›΄ε₯½ηš„ε€šζ ·ζ€§",
            )
            n_candidates = gr.Slider(
                1,
                3,
                value=3,
                step=1,
                label="候选数量",
                info="θΏ™δΈͺζ•°ε­—ζŽ§εˆΆε€™ι€‰ζ•°ι‡",
            )

    # Rendered waveform video produced by gr.make_waveform in text2audio.
    outputs = gr.Video(label="Output", elem_id="output-video")
    btn = gr.Button("Submit").style(full_width=True)

    btn.click(
        text2audio,
        inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
        outputs=[outputs],
    )

# Queue requests (max 10 waiting) so concurrent users don't collide on the GPU.
iface.queue(max_size=10).launch(debug=True)