haoheliu committed
Commit 39711bd
1 Parent(s): 4eab478

try out UI design

app.py CHANGED
@@ -1,55 +1,62 @@
 import gradio as gr
 import numpy as np
-# from audioldm import text_to_audio
-
-def text2audio(text, length):
-    # waveform = text_to_audio(text, n_gen=1)  # [bs, 1, samples]
-    # waveform = [(16000, wave[0]) for wave in waveform]
-    waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
-    return waveform
-
-# iface = gr.Interface(fn=greet, inputs="text", outputs=["audio", "audio"])
-# iface.launch()
-
-block = gr.Blocks()
-
-with block:
-    gr.HTML(
-        """
-        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-          <div
-            style="
-              display: inline-flex;
-              align-items: center;
-              gap: 0.8rem;
-              font-size: 1.75rem;
-            "
-          >
-            <h1 style="font-weight: 900; margin-bottom: 7px;">
-              Text-to-Audio Generation with AudioLDM
-            </h1>
-          </div>
-          <p style="margin-bottom: 10px; font-size: 94%">
-            <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project page]</a>
-          </p>
-        </div>
-        """
-    )
-    with gr.Group():
-        with gr.Box():
-            textbox = gr.Textbox(value="A man is speaking in a huge room")
-            length = gr.Slider(1.0, 30.0, value=5.0, step=0.5, label="Audio length in seconds")
-            # model = gr.Dropdown(choices=["harmonai/maestro-150k"], value="harmonai/maestro-150k", type="value", label="Model")
-            out = [gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
-            btn = gr.Button("Submit").style(full_width=True)
-
-        btn.click(text2audio, inputs=[textbox, length], outputs=out)
-    gr.HTML('''
-        <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
-            <p>Model by <a href="https://haoheliu.github.io/" style="text-decoration: underline;" target="_blank">Haohe Liu</a>
-            </p>
-        </div>
-    ''')
-
-block.launch(debug=True)
+from audioldm import text_to_audio, seed_everything, build_model
+
+audioldm = build_model()
+
+def text2audio(text, duration, guidance_scale):
+    # print(text, duration, guidance_scale)
+    waveform = text_to_audio(audioldm, text, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=1)  # [bs, 1, samples]
+    waveform = [(16000, wave[0]) for wave in waveform]
+    # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
+    return waveform
+
+iface = gr.Interface(fn=text2audio, inputs=[
+        gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
+        gr.Slider(2, 15, value=5, step=0.1),
+        gr.Slider(0, 5, value=2.5, step=0.5),
+    ], outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
+)
+iface.launch(share=True)
+
+# block = gr.Blocks()
+
+# with block:
+#     gr.HTML(
+#         """
+#         <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+#           <div
+#             style="
+#               display: inline-flex;
+#               align-items: center;
+#               gap: 0.8rem;
+#               font-size: 1.75rem;
+#             "
+#           >
+#             <h1 style="font-weight: 900; margin-bottom: 7px;">
+#               Text-to-Audio Generation with AudioLDM
+#             </h1>
+#           </div>
+#           <p style="margin-bottom: 10px; font-size: 94%">
+#             <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project page]</a>
+#           </p>
+#         </div>
+#         """
+#     )
+#     with gr.Group():
+#         with gr.Box():
+#             textbox = gr.Textbox(value="A man is speaking in a huge room")
+#             length = gr.Slider(1.0, 30.0, value=5.0, step=0.5, label="Audio length in seconds")
+#             # model = gr.Dropdown(choices=["harmonai/maestro-150k"], value="harmonai/maestro-150k", type="value", label="Model")
+#             out = [gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
+#             btn = gr.Button("Submit").style(full_width=True)
+
+#         btn.click(text2audio, inputs=[textbox, length], outputs=out)
+#     gr.HTML('''
+#         <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
+#             <p>Model by <a href="https://haoheliu.github.io/" style="text-decoration: underline;" target="_blank">Haohe Liu</a>
+#             </p>
+#         </div>
+#     ''')
+
+# block.launch(debug=True)
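The app no longer builds the model inside the request handler: `build_model()` runs once at import time and the loaded `audioldm` object is passed into every `text_to_audio` call. The same API can be exercised outside Gradio; a minimal sketch (writing the output with `soundfile` is an assumption for illustration, not something this Space does):

```python
import soundfile as sf  # assumption: any WAV writer would do here

from audioldm import build_model, text_to_audio

audioldm = build_model()  # one-time checkpoint load, as in app.py

# Returns a batch of waveforms shaped [bs, 1, samples] at 16 kHz.
waveform = text_to_audio(
    audioldm,
    "A man is speaking in a huge room",
    duration=5,
    guidance_scale=2.5,
    n_candidate_gen_per_text=1,
)

for i, wave in enumerate(waveform):
    sf.write(f"sample_{i}.wav", wave[0], 16000)  # 16 kHz, as in app.py
```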
audioldm/latent_diffusion/ddim.py CHANGED
@@ -10,6 +10,7 @@ from audioldm.latent_diffusion.util import (
     noise_like,
     extract_into_tensor,
 )
+import gradio as gr
 
 class DDIMSampler(object):
     def __init__(self, model, schedule="linear", **kwargs):
@@ -200,6 +201,7 @@ class DDIMSampler(object):
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         # print(f"Running DDIM Sampling with {total_steps} timesteps")
 
+        # iterator = gr.Progress().tqdm(time_range, desc="DDIM Sampler", total=total_steps)
         iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps)
 
         for i, step in enumerate(iterator):
@@ -281,6 +283,7 @@ class DDIMSampler(object):
         total_steps = timesteps.shape[0]
         # print(f"Running DDIM Sampling with {total_steps} timesteps")
 
+        # iterator = gr.Progress().tqdm(time_range, desc="Decoding image", total=total_steps)
         iterator = tqdm(time_range, desc="Decoding image", total=total_steps)
         x_dec = x_latent
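The commented-out `gr.Progress().tqdm` lines hint at streaming sampler progress into the web UI. Constructing `gr.Progress()` deep inside library code is not how Gradio expects to receive it; in recent Gradio versions the tracker is injected as a default argument of the event handler, and `track_tqdm=True` mirrors tqdm bars created further down the call stack, which would cover the "DDIM Sampler" loop without touching ddim.py at all. A hedged sketch (`generate` is a hypothetical stand-in for the real handler body, and the `track_tqdm` flag assumes a Gradio version that supports `gr.Progress`):

```python
import gradio as gr

def text2audio(text, duration, guidance_scale,
               progress=gr.Progress(track_tqdm=True)):  # injected by Gradio
    # Any tqdm(...) created inside the call stack (e.g. the DDIM loop)
    # is mirrored to the UI's progress bar, so ddim.py stays unchanged.
    return generate(text, duration, guidance_scale)  # hypothetical helper
```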
audioldm/ldm.py CHANGED
@@ -636,7 +636,7 @@ class LatentDiffusion(DDPM):
         ddim_steps=200,
         ddim_eta=1.0,
         x_T=None,
-        n_gen=1,
+        n_candidate_gen_per_text=1,
         unconditional_guidance_scale=1.0,
         unconditional_conditioning=None,
         name="waveform",
@@ -644,7 +644,7 @@ class LatentDiffusion(DDPM):
         save=False,
         **kwargs,
     ):
-        # Generate n_gen times and select the best
+        # Generate n_candidate_gen_per_text times and select the best
         # Batch: audio, text, fnames
         assert x_T is None
         try:
@@ -672,17 +672,15 @@ class LatentDiffusion(DDPM):
         text = super().get_input(batch, "text")
 
         # Generate multiple samples
-        batch_size = z.shape[0] * n_gen
-        c = torch.cat([c] * n_gen, dim=0)
-        text = text * n_gen
+        batch_size = z.shape[0] * n_candidate_gen_per_text
+        c = torch.cat([c] * n_candidate_gen_per_text, dim=0)
+        text = text * n_candidate_gen_per_text
 
         if unconditional_guidance_scale != 1.0:
             unconditional_conditioning = (
                 self.cond_stage_model.get_unconditional_condition(batch_size)
             )
 
-        fnames = list(super().get_input(batch, "fname"))
-
         samples, _ = self.sample_log(
             cond=c,
             batch_size=batch_size,
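The rename from `n_gen` to `n_candidate_gen_per_text` makes the batching trick easier to read: the conditioning is tiled so every candidate for every prompt is drawn in one sampling pass, and one winner is then kept per prompt. A minimal sketch of that pattern; `sample_fn` and `score_fn` are placeholders, since the actual selection criterion lives outside this hunk:

```python
import torch

def sample_best_per_text(c, n_text, n_candidate_gen_per_text, sample_fn, score_fn):
    # Tile conditioning: candidates for all prompts share one batch,
    # exactly as generate_sample does above.
    batch_size = n_text * n_candidate_gen_per_text
    c = torch.cat([c] * n_candidate_gen_per_text, dim=0)
    samples = sample_fn(c, batch_size)  # one batched diffusion pass

    # Regroup as [n_candidates, n_text, ...]; matches the cat order above.
    samples = samples.view(n_candidate_gen_per_text, n_text, *samples.shape[1:])
    scores = score_fn(samples)           # placeholder: [n_candidates, n_text]
    best = scores.argmax(dim=0)          # best candidate index per prompt
    return samples[best, torch.arange(n_text)]
```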
audioldm/pipeline.py CHANGED
@@ -29,7 +29,7 @@ def make_batch_for_text_to_audio(text, batchsize=2):
     )
     return batch
 
-def text_to_audio(text, batchsize=2, guidance_scale=2.5, n_gen=1, config=None):
+def build_model(config=None):
     if(torch.cuda.is_available()):
         device = torch.device("cuda:0")
     else:
@@ -57,13 +57,16 @@ def text_to_audio(text, batchsize=2, guidance_scale=2.5, n_gen=1, config=None):
     latent_diffusion = latent_diffusion.to(device)
 
     latent_diffusion.cond_stage_model.embed_mode = "text"
+    return latent_diffusion
+
 
-    batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
-
+def text_to_audio(latent_diffusion, text, duration=10, batchsize=2, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
+    batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
     with torch.no_grad():
         waveform = latent_diffusion.generate_sample(
             [batch],
             unconditional_guidance_scale=guidance_scale,
-            n_gen=n_gen,
+            n_candidate_gen_per_text=n_candidate_gen_per_text,
+            duration=duration
         )
     return waveform
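Splitting the old `text_to_audio` into `build_model()` plus a lightweight `text_to_audio(latent_diffusion, ...)` moves device selection and checkpoint loading out of the per-request path. A usage sketch of the split API (`seed_everything` is imported by app.py in this commit; that it fixes the global RNG state is an assumption, and the second prompt is illustrative):

```python
from audioldm import build_model, seed_everything, text_to_audio

latent_diffusion = build_model()  # pay the loading cost once

for prompt in ["A man is speaking in a huge room", "Birds singing at dawn"]:
    seed_everything(42)  # assumption: same seed -> repeatable samples
    waveform = text_to_audio(
        latent_diffusion,
        prompt,
        duration=10,                 # default in the new signature
        guidance_scale=2.5,
        n_candidate_gen_per_text=3,  # draw 3 candidates, keep the best
    )
```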