haoheliu committed on
Commit
412929c
1 Parent(s): 39711bd

update code

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. app.py +4 -3
  3. audioldm/ldm.py +1 -1
  4. audioldm/pipeline.py +9 -3
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  *.pyc
2
  __pycache__
3
- test.py
 
1
  *.pyc
2
  __pycache__
3
+ test.py
4
+ flagged
app.py CHANGED
@@ -4,17 +4,18 @@ from audioldm import text_to_audio, seed_everything, build_model
4
 
5
  audioldm = build_model()
6
 
7
- def text2audio(text, duration, guidance_scale):
8
  # print(text, length, guidance_scale)
9
- waveform = text_to_audio(audioldm, text, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=1) # [bs, 1, samples]
10
  waveform = [(16000, wave[0]) for wave in waveform]
11
  # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
12
  return waveform
13
 
14
  iface = gr.Interface(fn=text2audio, inputs=[
15
  gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
16
- gr.Slider(2, 15, value=5, step=0.1),
17
  gr.Slider(0, 5, value=2.5, step=0.5),
 
18
  ], outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
19
  )
20
  iface.launch(share=True)
4
 
5
  audioldm = build_model()
6
 
7
+ def text2audio(text, duration, guidance_scale, random_seed):
8
  # print(text, length, guidance_scale)
9
+ waveform = text_to_audio(audioldm, text, random_seed, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=1) # [bs, 1, samples]
10
  waveform = [(16000, wave[0]) for wave in waveform]
11
  # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
12
  return waveform
13
 
14
  iface = gr.Interface(fn=text2audio, inputs=[
15
  gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
16
+ gr.Slider(2.5, 10, value=5, step=2.5),
17
  gr.Slider(0, 5, value=2.5, step=0.5),
18
+ gr.Number(value=42)
19
  ], outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
20
  )
21
  iface.launch(share=True)
audioldm/ldm.py CHANGED
@@ -659,7 +659,7 @@ class LatentDiffusion(DDPM):
659
  # os.makedirs(waveform_save_path, exist_ok=True)
660
  # print("Waveform save path: ", waveform_save_path)
661
 
662
- with self.ema_scope("Plotting"):
663
  for batch in batchs:
664
  z, c = self.get_input(
665
  batch,
659
  # os.makedirs(waveform_save_path, exist_ok=True)
660
  # print("Waveform save path: ", waveform_save_path)
661
 
662
+ with self.ema_scope("Generate"):
663
  for batch in batchs:
664
  z, c = self.get_input(
665
  batch,
audioldm/pipeline.py CHANGED
@@ -6,9 +6,10 @@ import argparse
6
  import yaml
7
  import torch
8
 
9
- from audioldm import LatentDiffusion
10
  from audioldm.utils import default_audioldm_config
11
 
 
12
  import time
13
 
14
  def make_batch_for_text_to_audio(text, batchsize=2):
@@ -18,7 +19,7 @@ def make_batch_for_text_to_audio(text, batchsize=2):
18
  fbank = torch.zeros((batchsize, 1024, 64)) # Not used, here to keep the code format
19
  stft = torch.zeros((batchsize, 1024, 512)) # Not used
20
  waveform = torch.zeros((batchsize, 160000)) # Not used
21
- fname = ["%s.wav" % x for x in range(batchsize)]
22
  batch = (
23
  fbank,
24
  stft,
@@ -59,9 +60,14 @@ def build_model(config=None):
59
  latent_diffusion.cond_stage_model.embed_mode = "text"
60
  return latent_diffusion
61
 
 
 
62
 
63
- def text_to_audio(latent_diffusion, text, duration=10, batchsize=2, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
 
64
  batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
 
 
65
  with torch.no_grad():
66
  waveform = latent_diffusion.generate_sample(
67
  [batch],
6
  import yaml
7
  import torch
8
 
9
+ from audioldm import LatentDiffusion, seed_everything
10
  from audioldm.utils import default_audioldm_config
11
 
12
+
13
  import time
14
 
15
  def make_batch_for_text_to_audio(text, batchsize=2):
19
  fbank = torch.zeros((batchsize, 1024, 64)) # Not used, here to keep the code format
20
  stft = torch.zeros((batchsize, 1024, 512)) # Not used
21
  waveform = torch.zeros((batchsize, 160000)) # Not used
22
+ fname = [""] * batchsize # Not used
23
  batch = (
24
  fbank,
25
  stft,
60
  latent_diffusion.cond_stage_model.embed_mode = "text"
61
  return latent_diffusion
62
 
63
+ def duration_to_latent_t_size(duration):
64
+ return int(duration * 25.6)
65
 
66
+ def text_to_audio(latent_diffusion, text, seed=42, duration=10, batchsize=2, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
67
+ seed_everything(int(seed))
68
  batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
69
+
70
+ latent_diffusion.latent_t_size = duration_to_latent_t_size(duration)
71
  with torch.no_grad():
72
  waveform = latent_diffusion.generate_sample(
73
  [batch],