haoheliu committed
Commit 4e9d8a1
1 Parent(s): c55c219

two output to one output

app.py CHANGED
@@ -30,7 +30,10 @@ def text2audio(text, duration, guidance_scale, random_seed, n_candidates):
     waveform = text_to_audio(audioldm, text, random_seed, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=int(n_candidates)) # [bs, 1, samples]
     waveform = [(16000, wave[0]) for wave in waveform]
     # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
-    return waveform
+    if(len(waveform) == 1):
+        return waveform[0]
+    else:
+        return waveform
 
 # iface = gr.Interface(fn=text2audio, inputs=[
 #     gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
@@ -71,13 +74,14 @@ with iface:
     ############# Input
     textbox = gr.Textbox(value="A hammer is hitting a wooden surface", max_lines=1)
 
-    with gr.Accordion("Click to change detailed configurations", open=False):
+    with gr.Accordion("Click to modify detailed configurations", open=False):
         seed = gr.Number(value=42, label="Change this value (any integer number) will lead to a different generation result.")
         duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
         guidance_scale = gr.Slider(0, 5, value=2.5, step=0.5, label="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)")
         n_candidates = gr.Slider(1, 5, value=3, step=1, label="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation")
     ############# Output
-    outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
+    outputs=[gr.Audio(label="Output", type="numpy")]
+    # outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
 
     btn = gr.Button("Submit").style(full_width=True)
     btn.click(text2audio, inputs=[textbox, duration, guidance_scale, seed, n_candidates], outputs=outputs)
@@ -89,6 +93,6 @@ with iface:
     </div>
     ''')
 
-iface.queue(concurrency_count=2)
+iface.queue(concurrency_count = 2)
 iface.launch(debug=True)
 # iface.launch(debug=True, share=True)
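With a single gr.Audio component in `outputs`, Gradio expects the click handler to return one (sample_rate, samples) tuple rather than a list of tuples, which is what the new `len(waveform) == 1` branch in `text2audio` provides. A minimal, self-contained sketch of the same single-output wiring (the handler below returns random noise purely for illustration, echoing the commented-out dummy in app.py; it is not the Space's model call):

    import numpy as np
    import gradio as gr

    def dummy_text2audio(text):
        # One output component -> return a single (sample_rate, samples) tuple,
        # not the list of tuples the earlier two-output version returned.
        return (16000, np.random.randn(16000))

    with gr.Blocks() as demo:
        textbox = gr.Textbox(value="A hammer is hitting a wooden surface", max_lines=1)
        out = gr.Audio(label="Output", type="numpy")
        gr.Button("Submit").click(dummy_text2audio, inputs=[textbox], outputs=[out])

    demo.launch()
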
audioldm/clap/open_clip/model.py CHANGED
@@ -745,6 +745,8 @@ class CLAP(nn.Module):
         device = next(self.parameters()).device
         for k in data:
             data[k] = data[k].to(device)
+            if(len(data[k].size()) < 2):
+                data[k] = data[k].unsqueeze(0)
         text_embeds = self.encode_text(data, device=device)
         text_embeds = F.normalize(text_embeds, dim=-1)
 
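The added guard restores a batch dimension when a tensor in `data` arrives 1-D, which can happen once only a single caption is tokenized (batch size 1). A small illustration of the same check with a made-up token tensor:

    import torch

    input_ids = torch.tensor([101, 2023, 2003, 102])  # hypothetical token ids, shape [seq_len]
    if len(input_ids.size()) < 2:                     # same test as in the commit; equivalent to input_ids.dim() < 2
        input_ids = input_ids.unsqueeze(0)            # -> shape [1, seq_len]
    print(input_ids.shape)                            # torch.Size([1, 4])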
 
audioldm/ldm.py CHANGED
@@ -697,18 +697,19 @@ class LatentDiffusion(DDPM):
 
         waveform = self.mel_spectrogram_to_waveform(mel)
 
-        similarity = self.cond_stage_model.cos_similarity(
-            torch.FloatTensor(waveform).squeeze(1), text
-        )
+        if(waveform.shape[0] > 1):
+            similarity = self.cond_stage_model.cos_similarity(
+                torch.FloatTensor(waveform).squeeze(1), text
+            )
 
-        best_index = []
-        for i in range(z.shape[0]):
-            candidates = similarity[i :: z.shape[0]]
-            max_index = torch.argmax(candidates).item()
-            best_index.append(i + max_index * z.shape[0])
+            best_index = []
+            for i in range(z.shape[0]):
+                candidates = similarity[i :: z.shape[0]]
+                max_index = torch.argmax(candidates).item()
+                best_index.append(i + max_index * z.shape[0])
 
-        waveform = waveform[best_index]
-        # print("Similarity between generated audio and text", similarity)
-        # print("Choose the following indexes:", best_index)
-
+            waveform = waveform[best_index]
+            # print("Similarity between generated audio and text", similarity)
+            # print("Choose the following indexes:", best_index)
+
         return waveform
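For context on what the new `waveform.shape[0] > 1` check skips: with `bs` prompts in the batch and several candidates per prompt, the generated waveforms are interleaved so that the candidates for prompt `i` sit at indices `i, i + bs, i + 2*bs, ...`; CLAP cosine similarity then keeps the best candidate per prompt. When only one waveform exists there is nothing to rank, so the block is bypassed. A toy sketch of the index selection with made-up similarity scores (not actual model output):

    import torch

    bs = 1                                           # one prompt in the batch
    similarity = torch.tensor([0.21, 0.35, 0.28])    # made-up CLAP scores for 3 candidates, shape [bs * n_candidates]

    best_index = []
    for i in range(bs):
        candidates = similarity[i::bs]               # all candidate scores for prompt i
        best_index.append(i + torch.argmax(candidates).item() * bs)

    print(best_index)                                # [1] -> keep waveform[1]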
audioldm/pipeline.py CHANGED
@@ -12,10 +12,10 @@ from audioldm.utils import default_audioldm_config
 
 import time
 
-def make_batch_for_text_to_audio(text, batchsize=2):
+def make_batch_for_text_to_audio(text, batchsize=1):
     text = [text] * batchsize
-    if batchsize < 2:
-        print("Warning: Batchsize must be at least 2. Batchsize is set to 2.")
+    if batchsize < 1:
+        print("Warning: Batchsize must be at least 1. Batchsize is set to .")
     fbank = torch.zeros((batchsize, 1024, 64)) # Not used, here to keep the code format
     stft = torch.zeros((batchsize, 1024, 512)) # Not used
     waveform = torch.zeros((batchsize, 160000)) # Not used
@@ -63,7 +63,7 @@ def build_model(config=None):
 def duration_to_latent_t_size(duration):
     return int(duration * 25.6)
 
-def text_to_audio(latent_diffusion, text, seed=42, duration=10, batchsize=2, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
+def text_to_audio(latent_diffusion, text, seed=42, duration=10, batchsize=1, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
     seed_everything(int(seed))
     batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
 
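With the default `batchsize` lowered to 1, a typical call now generates one copy of the prompt per batch (re-ranking over `n_candidate_gen_per_text` still happens inside the model when more than one candidate is produced). A usage sketch, assuming `build_model()` can find or download the AudioLDM checkpoint on the host:

    from audioldm.pipeline import build_model, text_to_audio

    audioldm = build_model()                     # loads the latent diffusion model (checkpoint must be available)
    waveform = text_to_audio(
        audioldm,
        "A hammer is hitting a wooden surface",
        seed=42,
        duration=5,
        batchsize=1,                             # new default: a single prompt copy
        guidance_scale=2.5,
        n_candidate_gen_per_text=3,
    )
    # waveform has shape [batchsize, 1, samples], as noted in app.py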
 
requirements.txt CHANGED
@@ -11,6 +11,7 @@ numpy<=1.23.5
 soundfile
 librosa
 pandas
+# transformers
 torchlibrosa
 transformers
 ftfy