Re-included multiple candidates to improve quality
Browse files
app.py
CHANGED
@@ -5,20 +5,27 @@ from diffusers import AudioLDMPipeline
|
|
5 |
|
6 |
from transformers import AutoProcessor, ClapModel
|
7 |
|
8 |
-
#
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# load AudioLDM Diffuser Pipeline
|
13 |
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-m-full", torch_dtype=torch_dtype).to(device)
|
14 |
pipe.unet = torch.compile(pipe.unet)
|
15 |
|
16 |
-
#
|
|
|
|
|
17 |
|
18 |
generator = torch.Generator(device)
|
19 |
|
20 |
-
#
|
21 |
-
def text2audio(text, negative_prompt, duration, guidance_scale, random_seed):
|
22 |
if text is None:
|
23 |
raise gr.Error("Please provide a text input.")
|
24 |
|
@@ -27,14 +34,27 @@ def text2audio(text, negative_prompt, duration, guidance_scale, random_seed):
|
|
27 |
audio_length_in_s=duration,
|
28 |
guidance_scale=guidance_scale,
|
29 |
negative_prompt=negative_prompt,
|
30 |
-
num_waveforms_per_prompt=1,
|
31 |
generator=generator.manual_seed(int(random_seed)),
|
32 |
)["audios"]
|
33 |
|
34 |
-
|
|
|
|
|
|
|
35 |
|
36 |
return gr.make_waveform((16000, waveform), bg_image="bg.png")
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
# duplicate CSS config
|
39 |
|
40 |
css = """
|
@@ -171,13 +191,21 @@ with iface:
|
|
171 |
label="Guidance scale",
|
172 |
info="Large => better quality and relevancy to text; Small => better diversity",
|
173 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
|
175 |
outputs = gr.Video(label="Output", elem_id="output-video")
|
176 |
btn = gr.Button("Submit").style(full_width=True)
|
177 |
|
178 |
btn.click(
|
179 |
text2audio,
|
180 |
-
inputs=[textbox, negative_textbox, duration, guidance_scale, seed],
|
181 |
outputs=[outputs],
|
182 |
)
|
183 |
|
|
|
5 |
|
6 |
from transformers import AutoProcessor, ClapModel
|
7 |
|
8 |
+
# Device/dtype selection (adapted from AudioLDM's original app.py):
# use half precision on CUDA, and fall back to CPU with float32 otherwise.
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32
|
16 |
|
17 |
# Load the AudioLDM text-to-audio diffusion pipeline, move it to the selected
# device, and compile the UNet for faster repeated inference.
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-m-full", torch_dtype=torch_dtype)
pipe = pipe.to(device)
pipe.unet = torch.compile(pipe.unet)
|
20 |
|
21 |
+
# CLAP scores text/audio similarity; it is used to rank candidate waveforms
# and return the one that best matches the prompt (improves output quality).
_clap_checkpoint = "sanchit-gandhi/clap-htsat-unfused-m-full"
clap_model = ClapModel.from_pretrained(_clap_checkpoint).to(device)
processor = AutoProcessor.from_pretrained(_clap_checkpoint)
|
24 |
|
25 |
# Shared RNG; text2audio re-seeds it per request for reproducible generation.
generator = torch.Generator(device=device)
|
26 |
|
27 |
+
# from audioldm app.py
|
28 |
+
def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
|
29 |
if text is None:
|
30 |
raise gr.Error("Please provide a text input.")
|
31 |
|
|
|
34 |
audio_length_in_s=duration,
|
35 |
guidance_scale=guidance_scale,
|
36 |
negative_prompt=negative_prompt,
|
37 |
+
num_waveforms_per_prompt=n_candidates if n_candidates else 1,
|
38 |
generator=generator.manual_seed(int(random_seed)),
|
39 |
)["audios"]
|
40 |
|
41 |
+
if waveforms.shape[0] > 1:
|
42 |
+
waveform = score_waveforms(text, waveforms)
|
43 |
+
else:
|
44 |
+
waveform = waveforms[0]
|
45 |
|
46 |
return gr.make_waveform((16000, waveform), bg_image="bg.png")
|
47 |
|
48 |
+
def score_waveforms(text, waveforms):
    """Return the candidate waveform that CLAP rates closest to the prompt.

    The CLAP processor batches the prompt against every candidate audio, the
    model's text-audio similarity logits are softmaxed over the candidates,
    and the argmax candidate is returned.
    """
    clap_inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
    clap_inputs = {key: clap_inputs[key].to(device) for key in clap_inputs}
    with torch.no_grad():
        # audio-text similarity score, one logit per candidate waveform
        logits_per_text = clap_model(**clap_inputs).logits_per_text
    # softmax to probabilities, then keep the most likely candidate
    best_index = torch.argmax(logits_per_text.softmax(dim=-1))
    return waveforms[best_index]
|
57 |
+
|
58 |
# duplicate CSS config
|
59 |
|
60 |
css = """
|
|
|
191 |
label="Guidance scale",
|
192 |
info="Large => better quality and relevancy to text; Small => better diversity",
|
193 |
)
|
194 |
+
# Number of candidate waveforms to generate per prompt; the best one (as
# scored by CLAP in score_waveforms) is shown to the user.
# Fixes grammatical errors in the user-facing label and info strings.
n_candidates = gr.Slider(
    1,
    3,
    value=3,
    step=1,
    label="Number of waveforms to generate",
    info="Automatic quality control: this number controls how many candidate audios are generated (e.g., generate three audios and show you the best one). A larger value usually leads to better quality at the cost of heavier computation.",
)
|
202 |
|
203 |
# Output widget (the rendered waveform video) and the submit-button wiring.
outputs = gr.Video(label="Output", elem_id="output-video")
btn = gr.Button("Submit").style(full_width=True)

btn.click(
    fn=text2audio,
    inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
    outputs=[outputs],
)
|
211 |
|