haoheliu committed
Commit 39711bd
1 Parent(s): 4eab478

try out UI design

app.py CHANGED
@@ -1,55 +1,62 @@
 import gradio as gr
 import numpy as np
-# from audioldm import text_to_audio
-
-def text2audio(text, length):
-    # waveform = text_to_audio(text, n_gen=1)  # [bs, 1, samples]
-    # waveform = [(16000, wave[0]) for wave in waveform]
-    waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
-    return waveform
-
-# iface = gr.Interface(fn=greet, inputs="text", outputs=["audio", "audio"])
-# iface.launch()
-
-block = gr.Blocks()
-
-with block:
-    gr.HTML(
-        """
-        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-          <div
-            style="
-              display: inline-flex;
-              align-items: center;
-              gap: 0.8rem;
-              font-size: 1.75rem;
-            "
-          >
-            <h1 style="font-weight: 900; margin-bottom: 7px;">
-              Text-to-Audio Generation with AudioLDM
-            </h1>
-          </div>
-          <p style="margin-bottom: 10px; font-size: 94%">
-            <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project page]</a>
-          </p>
-        </div>
-        """
-    )
-    with gr.Group():
-        with gr.Box():
-            textbox = gr.Textbox(value="A man is speaking in a huge room")
-            length = gr.Slider(1.0, 30.0, value=5.0, step=0.5, label="Audio length in seconds")
-            # model = gr.Dropdown(choices=["harmonai/maestro-150k"], value="harmonai/maestro-150k", type="value", label="Model")
-            out = [gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
-            btn = gr.Button("Submit").style(full_width=True)
-
-        btn.click(text2audio, inputs=[textbox, length], outputs=out)
-    gr.HTML('''
-        <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
-            <p>Model by <a href="https://haoheliu.github.io/" style="text-decoration: underline;" target="_blank">Haohe Liu</a>
-            </p>
-        </div>
-    ''')
-
-block.launch(debug=True)
+from audioldm import text_to_audio, seed_everything, build_model
+
+audioldm = build_model()
+
+def text2audio(text, duration, guidance_scale):
+    # print(text, duration, guidance_scale)
+    waveform = text_to_audio(audioldm, text, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=1)  # [bs, 1, samples]
+    waveform = [(16000, wave[0]) for wave in waveform]
+    # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
+    return waveform
+
+iface = gr.Interface(fn=text2audio, inputs=[
+        gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
+        gr.Slider(2, 15, value=5, step=0.1),
+        gr.Slider(0, 5, value=2.5, step=0.5),
+    ], outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
+)
+iface.launch(share=True)
+
+# block = gr.Blocks()
+
+# with block:
+#     gr.HTML(
+#         """
+#         <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+#           <div
+#             style="
+#               display: inline-flex;
+#               align-items: center;
+#               gap: 0.8rem;
+#               font-size: 1.75rem;
+#             "
+#           >
+#             <h1 style="font-weight: 900; margin-bottom: 7px;">
+#               Text-to-Audio Generation with AudioLDM
+#             </h1>
+#           </div>
+#           <p style="margin-bottom: 10px; font-size: 94%">
+#             <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project page]</a>
+#           </p>
+#         </div>
+#         """
+#     )
+#     with gr.Group():
+#         with gr.Box():
+#             textbox = gr.Textbox(value="A man is speaking in a huge room")
+#             length = gr.Slider(1.0, 30.0, value=5.0, step=0.5, label="Audio length in seconds")
+#             # model = gr.Dropdown(choices=["harmonai/maestro-150k"], value="harmonai/maestro-150k", type="value", label="Model")
+#             out = [gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
+#             btn = gr.Button("Submit").style(full_width=True)
+
+#         btn.click(text2audio, inputs=[textbox, length], outputs=out)
+#     gr.HTML('''
+#         <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
+#             <p>Model by <a href="https://haoheliu.github.io/" style="text-decoration: underline;" target="_blank">Haohe Liu</a>
+#             </p>
+#         </div>
+#     ''')
+
+# block.launch(debug=True)
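The app no longer builds the model inside the request handler: `build_model()` runs once at import time and the loaded `audioldm` object is passed into every `text_to_audio` call. The same API can be exercised outside Gradio; a minimal sketch (writing the output with `soundfile` is an assumption for illustration, not something this Space does):

```python
import soundfile as sf  # assumption: any WAV writer would do here

from audioldm import build_model, text_to_audio

audioldm = build_model()  # one-time checkpoint load, as in app.py

# Returns a batch of waveforms shaped [bs, 1, samples] at 16 kHz.
waveform = text_to_audio(
    audioldm,
    "A man is speaking in a huge room",
    duration=5,
    guidance_scale=2.5,
    n_candidate_gen_per_text=1,
)

for i, wave in enumerate(waveform):
    sf.write(f"sample_{i}.wav", wave[0], 16000)  # 16 kHz, as in app.py
```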
audioldm/latent_diffusion/ddim.py CHANGED
@@ -10,6 +10,7 @@ from audioldm.latent_diffusion.util import (
     noise_like,
     extract_into_tensor,
 )
+import gradio as gr
 
 class DDIMSampler(object):
     def __init__(self, model, schedule="linear", **kwargs):
@@ -200,6 +201,7 @@ class DDIMSampler(object):
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         # print(f"Running DDIM Sampling with {total_steps} timesteps")
 
+        # iterator = gr.Progress().tqdm(time_range, desc="DDIM Sampler", total=total_steps)
         iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps)
 
         for i, step in enumerate(iterator):
@@ -281,6 +283,7 @@ class DDIMSampler(object):
         total_steps = timesteps.shape[0]
         # print(f"Running DDIM Sampling with {total_steps} timesteps")
 
+        # iterator = gr.Progress().tqdm(time_range, desc="Decoding image", total=total_steps)
         iterator = tqdm(time_range, desc="Decoding image", total=total_steps)
         x_dec = x_latent
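The commented-out `gr.Progress().tqdm` lines hint at streaming sampler progress into the web UI. Constructing `gr.Progress()` deep inside library code is not how Gradio expects to receive it; in recent Gradio versions the tracker is injected as a default argument of the event handler, and `track_tqdm=True` mirrors tqdm bars created further down the call stack, which would cover the "DDIM Sampler" loop without touching ddim.py at all. A hedged sketch (`generate` is a hypothetical stand-in for the real handler body, and the `track_tqdm` flag assumes a Gradio version that supports `gr.Progress`):

```python
import gradio as gr

def text2audio(text, duration, guidance_scale,
               progress=gr.Progress(track_tqdm=True)):  # injected by Gradio
    # Any tqdm(...) created inside the call stack (e.g. the DDIM loop)
    # is mirrored to the UI's progress bar, so ddim.py stays unchanged.
    return generate(text, duration, guidance_scale)  # hypothetical helper
```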
audioldm/ldm.py CHANGED
@@ -636,7 +636,7 @@ class LatentDiffusion(DDPM):
         ddim_steps=200,
         ddim_eta=1.0,
         x_T=None,
-        n_gen=1,
+        n_candidate_gen_per_text=1,
         unconditional_guidance_scale=1.0,
         unconditional_conditioning=None,
         name="waveform",
@@ -644,7 +644,7 @@ class LatentDiffusion(DDPM):
         save=False,
         **kwargs,
     ):
-        # Generate n_gen times and select the best
+        # Generate n_candidate_gen_per_text times and select the best
         # Batch: audio, text, fnames
         assert x_T is None
         try:
@@ -672,17 +672,15 @@ class LatentDiffusion(DDPM):
         text = super().get_input(batch, "text")
 
         # Generate multiple samples
-        batch_size = z.shape[0] * n_gen
-        c = torch.cat([c] * n_gen, dim=0)
-        text = text * n_gen
+        batch_size = z.shape[0] * n_candidate_gen_per_text
+        c = torch.cat([c] * n_candidate_gen_per_text, dim=0)
+        text = text * n_candidate_gen_per_text
 
         if unconditional_guidance_scale != 1.0:
             unconditional_conditioning = (
                 self.cond_stage_model.get_unconditional_condition(batch_size)
             )
 
-        fnames = list(super().get_input(batch, "fname"))
-
         samples, _ = self.sample_log(
             cond=c,
             batch_size=batch_size,
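The rename from `n_gen` to `n_candidate_gen_per_text` makes the batching trick easier to read: the conditioning is tiled so every candidate for every prompt is drawn in one sampling pass, and one winner is then kept per prompt. A minimal sketch of that pattern; `sample_fn` and `score_fn` are placeholders, since the actual selection criterion lives outside this hunk:

```python
import torch

def sample_best_per_text(c, n_text, n_candidate_gen_per_text, sample_fn, score_fn):
    # Tile conditioning: candidates for all prompts share one batch,
    # exactly as generate_sample does above.
    batch_size = n_text * n_candidate_gen_per_text
    c = torch.cat([c] * n_candidate_gen_per_text, dim=0)
    samples = sample_fn(c, batch_size)  # one batched diffusion pass

    # Regroup as [n_candidates, n_text, ...]; matches the cat order above.
    samples = samples.view(n_candidate_gen_per_text, n_text, *samples.shape[1:])
    scores = score_fn(samples)           # placeholder: [n_candidates, n_text]
    best = scores.argmax(dim=0)          # best candidate index per prompt
    return samples[best, torch.arange(n_text)]
```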
audioldm/pipeline.py CHANGED
@@ -29,7 +29,7 @@ def make_batch_for_text_to_audio(text, batchsize=2):
     )
     return batch
 
-def text_to_audio(text, batchsize=2, guidance_scale=2.5, n_gen=1, config=None):
+def build_model(config=None):
     if(torch.cuda.is_available()):
         device = torch.device("cuda:0")
     else:
@@ -57,13 +57,16 @@ def text_to_audio(text, batchsize=2, guidance_scale=2.5, n_gen=1, config=None):
     latent_diffusion = latent_diffusion.to(device)
 
     latent_diffusion.cond_stage_model.embed_mode = "text"
+    return latent_diffusion
+
 
-    batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
-
+def text_to_audio(latent_diffusion, text, duration=10, batchsize=2, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
+    batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
     with torch.no_grad():
         waveform = latent_diffusion.generate_sample(
             [batch],
             unconditional_guidance_scale=guidance_scale,
-            n_gen=n_gen,
+            n_candidate_gen_per_text=n_candidate_gen_per_text,
+            duration=duration
         )
     return waveform
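Splitting the old `text_to_audio` into `build_model()` plus a lightweight `text_to_audio(latent_diffusion, ...)` moves device selection and checkpoint loading out of the per-request path. A usage sketch of the split API (`seed_everything` is imported by app.py in this commit; that it fixes the global RNG state is an assumption, and the second prompt is illustrative):

```python
from audioldm import build_model, seed_everything, text_to_audio

latent_diffusion = build_model()  # pay the loading cost once

for prompt in ["A man is speaking in a huge room", "Birds singing at dawn"]:
    seed_everything(42)  # assumption: same seed -> repeatable samples
    waveform = text_to_audio(
        latent_diffusion,
        prompt,
        duration=10,                 # default in the new signature
        guidance_scale=2.5,
        n_candidate_gen_per_text=3,  # draw 3 candidates, keep the best
    )
```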