Spaces: haoheliu/audioldm-text-to-audio-generation

Running on A10G

haoheliu committed on Feb 2, 2023

Commit 412929c • 1 Parent(s): 39711bd

update code

Files changed (4) hide show

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 *.pyc
 __pycache__
-test.py

 *.pyc
 __pycache__
+test.py
+flagged

app.py CHANGED Viewed

@@ -4,17 +4,18 @@ from audioldm import text_to_audio, seed_everything, build_model
 audioldm = build_model()
-def text2audio(text, duration, guidance_scale):
     # print(text, length, guidance_scale)
-    waveform = text_to_audio(audioldm, text, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=1) # [bs, 1, samples]
     waveform = [(16000, wave[0]) for wave in waveform]
     # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
     return waveform
 iface = gr.Interface(fn=text2audio, inputs=[
         gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
-        gr.Slider(2, 15, value=5, step=0.1),
         gr.Slider(0, 5, value=2.5, step=0.5),
     ], outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
                      )
 iface.launch(share=True)

 audioldm = build_model()
+def text2audio(text, duration, guidance_scale, random_seed):
     # print(text, length, guidance_scale)
+    waveform = text_to_audio(audioldm, text, random_seed, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=1) # [bs, 1, samples]
     waveform = [(16000, wave[0]) for wave in waveform]
     # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
     return waveform
 iface = gr.Interface(fn=text2audio, inputs=[
         gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
+        gr.Slider(2.5, 10, value=5, step=2.5),
         gr.Slider(0, 5, value=2.5, step=0.5),
+        gr.Number(value=42)
     ], outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
                      )
 iface.launch(share=True)

audioldm/ldm.py CHANGED Viewed

@@ -659,7 +659,7 @@ class LatentDiffusion(DDPM):
         # os.makedirs(waveform_save_path, exist_ok=True)
         # print("Waveform save path: ", waveform_save_path)
-        with self.ema_scope("Plotting"):
             for batch in batchs:
                 z, c = self.get_input(
                     batch,

         # os.makedirs(waveform_save_path, exist_ok=True)
         # print("Waveform save path: ", waveform_save_path)
+        with self.ema_scope("Generate"):
             for batch in batchs:
                 z, c = self.get_input(
                     batch,

audioldm/pipeline.py CHANGED Viewed

@@ -6,9 +6,10 @@ import argparse
 import yaml
 import torch
-from audioldm import LatentDiffusion
 from audioldm.utils import default_audioldm_config
 import time
 def make_batch_for_text_to_audio(text, batchsize=2):
@@ -18,7 +19,7 @@ def make_batch_for_text_to_audio(text, batchsize=2):
     fbank = torch.zeros((batchsize, 1024, 64))  # Not used, here to keep the code format
     stft = torch.zeros((batchsize, 1024, 512))  # Not used
     waveform = torch.zeros((batchsize, 160000))  # Not used
-    fname = ["%s.wav" % x for x in range(batchsize)]
     batch = (
         fbank,
         stft,
@@ -59,9 +60,14 @@ def build_model(config=None):
     latent_diffusion.cond_stage_model.embed_mode = "text"
     return latent_diffusion
-def text_to_audio(latent_diffusion, text, duration=10, batchsize=2, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
     batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
     with torch.no_grad():
         waveform = latent_diffusion.generate_sample(
             [batch],

 import yaml
 import torch
+from audioldm import LatentDiffusion, seed_everything
 from audioldm.utils import default_audioldm_config
 import time
 def make_batch_for_text_to_audio(text, batchsize=2):
     fbank = torch.zeros((batchsize, 1024, 64))  # Not used, here to keep the code format
     stft = torch.zeros((batchsize, 1024, 512))  # Not used
     waveform = torch.zeros((batchsize, 160000))  # Not used
+    fname = [""] * batchsize # Not used
     batch = (
         fbank,
         stft,
     latent_diffusion.cond_stage_model.embed_mode = "text"
     return latent_diffusion
+def duration_to_latent_t_size(duration):
+    return int(duration * 25.6)
+def text_to_audio(latent_diffusion, text, seed=42, duration=10, batchsize=2, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
+    seed_everything(int(seed))
     batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
+    latent_diffusion.latent_t_size = duration_to_latent_t_size(duration)
     with torch.no_grad():
         waveform = latent_diffusion.generate_sample(
             [batch],