haoheliu committed
Commit: 4e9d8a1
Parent(s): c55c219

two output to one output
Files changed:
- app.py (+8 -4)
- audioldm/clap/open_clip/model.py (+2 -0)
- audioldm/ldm.py (+13 -12)
- audioldm/pipeline.py (+4 -4)
- requirements.txt (+1 -0)
app.py
CHANGED
@@ -30,7 +30,10 @@ def text2audio(text, duration, guidance_scale, random_seed, n_candidates):
     waveform = text_to_audio(audioldm, text, random_seed, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=int(n_candidates)) # [bs, 1, samples]
     waveform = [(16000, wave[0]) for wave in waveform]
     # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
-
+    if(len(waveform) == 1):
+        return waveform[0]
+    else:
+        return waveform
 
 # iface = gr.Interface(fn=text2audio, inputs=[
 # gr.Textbox(value="A man is speaking in a huge room", max_lines=1),

@@ -71,13 +74,14 @@ with iface:
         ############# Input
         textbox = gr.Textbox(value="A hammer is hitting a wooden surface", max_lines=1)
 
-        with gr.Accordion("Click to
+        with gr.Accordion("Click to modify detailed configurations", open=False):
             seed = gr.Number(value=42, label="Change this value (any integer number) will lead to a different generation result.")
             duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
             guidance_scale = gr.Slider(0, 5, value=2.5, step=0.5, label="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)")
             n_candidates = gr.Slider(1, 5, value=3, step=1, label="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation")
         ############# Output
-        outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
+        outputs=[gr.Audio(label="Output", type="numpy")]
+        # outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
 
         btn = gr.Button("Submit").style(full_width=True)
         btn.click(text2audio, inputs=[textbox, duration, guidance_scale, seed, n_candidates], outputs=outputs)

@@ -89,6 +93,6 @@ with iface:
         </div>
         ''')
 
-iface.queue(concurrency_count=2)
+iface.queue(concurrency_count = 2)
 iface.launch(debug=True)
 # iface.launch(debug=True, share=True)
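As I read the app.py hunks, text2audio now returns a single (sample_rate, waveform) tuple whenever only one audio is generated, which matches the single gr.Audio output declared in the interface; the commented-out line preserves the earlier two-output wiring. A minimal, self-contained sketch of that contract, with a dummy sine-wave generator standing in for the real text_to_audio call (fake_text2audio, its inputs, and the tone it emits are illustrative only, not part of this Space):

# Sketch only: one (sample_rate, numpy_array) return value feeds one gr.Audio(type="numpy") output.
import numpy as np
import gradio as gr

def fake_text2audio(text, duration):
    # Stand-in for text_to_audio(audioldm, ...): synthesizes a 440 Hz tone instead of running AudioLDM.
    sr = 16000
    t = np.linspace(0, duration, int(sr * duration), endpoint=False)
    wave = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
    return (sr, wave)  # single tuple -> single audio player

demo = gr.Interface(
    fn=fake_text2audio,
    inputs=[gr.Textbox(value="A hammer is hitting a wooden surface", max_lines=1),
            gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")],
    outputs=gr.Audio(label="Output", type="numpy"),
)

if __name__ == "__main__":
    demo.launch()

Returning a list of tuples instead would need one gr.Audio component per element in outputs, which is exactly the shape the removed two-output version had.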
audioldm/clap/open_clip/model.py
CHANGED
@@ -745,6 +745,8 @@ class CLAP(nn.Module):
         device = next(self.parameters()).device
         for k in data:
             data[k] = data[k].to(device)
+            if(len(data[k].size()) < 2):
+                data[k] = data[k].unsqueeze(0)
         text_embeds = self.encode_text(data, device=device)
         text_embeds = F.normalize(text_embeds, dim=-1)
 
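The two added lines guard against un-batched tokenizer output: a 1-D tensor (e.g. shape [seq_len]) gets a leading batch dimension before encode_text sees it, presumably because the encoder expects [batch, seq_len]. An illustrative sketch of the same guard outside CLAP (ensure_batched and the token dict below are made up for the example):

import torch

def ensure_batched(data: dict) -> dict:
    # Promote any 1-D tensor to shape [1, ...] so downstream batched code is happy.
    for k in data:
        if data[k].dim() < 2:
            data[k] = data[k].unsqueeze(0)
    return data

tokens = {
    "input_ids": torch.tensor([101, 2023, 2003, 102]),  # un-batched: shape [4]
    "attention_mask": torch.ones(4, dtype=torch.long),  # un-batched: shape [4]
}
tokens = ensure_batched(tokens)
print(tokens["input_ids"].shape)  # torch.Size([1, 4])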
audioldm/ldm.py
CHANGED
@@ -697,18 +697,19 @@ class LatentDiffusion(DDPM):
 
         waveform = self.mel_spectrogram_to_waveform(mel)
 
-
-
-
+        if(waveform.shape[0] > 1):
+            similarity = self.cond_stage_model.cos_similarity(
+                torch.FloatTensor(waveform).squeeze(1), text
+            )
 
-
-
-
-
-
+            best_index = []
+            for i in range(z.shape[0]):
+                candidates = similarity[i :: z.shape[0]]
+                max_index = torch.argmax(candidates).item()
+                best_index.append(i + max_index * z.shape[0])
 
-
-
-
-
+            waveform = waveform[best_index]
+            # print("Similarity between generated audio and text", similarity)
+            # print("Choose the following indexes:", best_index)
+
         return waveform
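The block added to ldm.py keeps, for each prompt, the candidate waveform whose text–audio cosine similarity (computed by the CLAP conditioning model) is highest. The indexing assumes the generated batch is laid out prompt-major: candidate j for prompt i sits at flat index i + j * batchsize, so similarity[i::batchsize] gathers all of prompt i's candidate scores. A toy sketch of just that indexing, with made-up similarity numbers (batchsize and n_candidates below are illustrative):

import torch

batchsize = 2      # number of prompts, z.shape[0] in the diff
n_candidates = 3   # n_candidate_gen_per_text
# One score per generated waveform, length batchsize * n_candidates.
similarity = torch.tensor([0.10, 0.40,   # candidate 0 for prompts 0 and 1
                           0.90, 0.20,   # candidate 1
                           0.30, 0.70])  # candidate 2

best_index = []
for i in range(batchsize):
    candidates = similarity[i::batchsize]         # all candidates for prompt i
    max_index = torch.argmax(candidates).item()   # best candidate id for prompt i
    best_index.append(i + max_index * batchsize)  # map back to the flat index

print(best_index)  # [2, 5] -> the 0.90 and 0.70 entries

With the Space's defaults (batchsize 1, n_candidate_gen_per_text 3), this reduces to taking the argmax over three scores and returning a single waveform.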
audioldm/pipeline.py
CHANGED
@@ -12,10 +12,10 @@ from audioldm.utils import default_audioldm_config
 
 import time
 
-def make_batch_for_text_to_audio(text, batchsize=
+def make_batch_for_text_to_audio(text, batchsize=1):
     text = [text] * batchsize
-    if batchsize <
-    print("Warning: Batchsize must be at least
+    if batchsize < 1:
+        print("Warning: Batchsize must be at least 1. Batchsize is set to .")
     fbank = torch.zeros((batchsize, 1024, 64)) # Not used, here to keep the code format
     stft = torch.zeros((batchsize, 1024, 512)) # Not used
     waveform = torch.zeros((batchsize, 160000)) # Not used

@@ -63,7 +63,7 @@ def build_model(config=None):
 def duration_to_latent_t_size(duration):
     return int(duration * 25.6)
 
-def text_to_audio(latent_diffusion, text, seed=42, duration=10, batchsize=
+def text_to_audio(latent_diffusion, text, seed=42, duration=10, batchsize=1, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
     seed_everything(int(seed))
     batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
 
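For completeness, a hedged usage sketch of the updated signature (untested; it assumes build_model and text_to_audio are importable from audioldm.pipeline as defined in this file, that build_model can locate its default checkpoint, and that the result really is a [batchsize, 1, samples] array at 16 kHz as the comment in app.py indicates):

import soundfile as sf
from audioldm.pipeline import build_model, text_to_audio

audioldm = build_model()  # assumption: loads the default AudioLDM checkpoint
waveform = text_to_audio(
    audioldm,
    "A hammer is hitting a wooden surface",
    seed=42,
    duration=5,
    batchsize=1,                 # new default in this commit
    guidance_scale=2.5,
    n_candidate_gen_per_text=3,  # generate three candidates, keep the best
)  # expected shape: [1, 1, samples]
sf.write("output.wav", waveform[0, 0], 16000)

soundfile is already listed in requirements.txt, so writing the result to disk needs no extra dependency.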
requirements.txt
CHANGED
@@ -11,6 +11,7 @@ numpy<=1.23.5
 soundfile
 librosa
 pandas
+# transformers
 torchlibrosa
 transformers
 ftfy