lmzjms committed on
Commit
c52f81e
1 Parent(s): 988947c

Update audio_foundation_models.py

Files changed (1)
  1. audio_foundation_models.py +33 -29
audio_foundation_models.py CHANGED
@@ -135,11 +135,6 @@ class T2A:
         self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/text-to-audio/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta40multi_epoch=000085.ckpt', device=device)
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)

-    @prompts(name="Generate Audio From User Input Text",
-             description="useful for when you want to generate an audio "
-                         "from a user input text and it saved it to a file."
-                         "The input to this tool should be a string, "
-                         "representing the text used to generate audio.")

     def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
         SAMPLE_RATE = 16000
@@ -168,6 +163,12 @@ class T2A:
         best_wav = select_best_audio(text, wav_list)
         return best_wav

+    @prompts(name="Generate Audio From User Input Text",
+             description="useful for when you want to generate an audio "
+                         "from a user input text and it saved it to a file."
+                         "The input to this tool should be a string, "
+                         "representing the text used to generate audio.")
+
     def inference(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
         melbins,mel_len = 80,624
         with torch.no_grad():
@@ -188,11 +189,6 @@ class I2A:
         self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)

-    @prompts(name="Generate Audio From The Image",
-             description="useful for when you want to generate an audio "
-                         "based on an image. "
-                         "The input to this tool should be a string, "
-                         "representing the image_path. ")

     def img2audio(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
         SAMPLE_RATE = 16000
@@ -224,6 +220,13 @@ class I2A:
             wav_list.append((SAMPLE_RATE,wav))
         best_wav = wav_list[0]
         return best_wav
+
+    @prompts(name="Generate Audio From The Image",
+             description="useful for when you want to generate an audio "
+                         "based on an image. "
+                         "The input to this tool should be a string, "
+                         "representing the image_path. ")
+
     def inference(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
         melbins,mel_len = 80,624
         with torch.no_grad():
@@ -247,7 +250,6 @@ class TTS:
                          "representing the text used to be converted to speech.")

     def inference(self, text):
-        global temp_audio_filename
         inp = {"text": text}
         out = self.inferencer.infer_once(inp)
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
@@ -270,6 +272,11 @@ class T2S:
             'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
         }

+
+    def set_model_hparams(self):
+        set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
+        self.hp = hp
+
     @prompts(name="Generate Singing Voice From User Input Text, Note and Duration Sequence",
              description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) "
                          "and save it to a file."
@@ -278,11 +285,7 @@ class T2S:
                          "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
                          "The input to this tool should be a comma seperated string of three, "
                          "representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided. ")
-
-    def set_model_hparams(self):
-        set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
-        self.hp = hp
-
+
     def inference(self, inputs):
         self.set_model_hparams()
         val = inputs.split(",")
@@ -311,13 +314,6 @@ class TTS_OOD:
         self.set_model_hparams()
         self.pipe = GenerSpeechInfer(self.hp, device)

-    @prompts(name="Style Transfer",
-             description="useful for when you want to generate speech samples with styles "
-                         "(e.g., timbre, emotion, and prosody) derived from a reference custom voice. "
-                         "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
-                         "The input to this tool should be a comma seperated string of two, "
-                         "representing reference audio path and input text. " )
-
     def set_model_hparams(self):
         set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
         f0_stats_fn = f'{hp["binary_data_dir"]}/train_f0s_mean_std.npy'
@@ -328,6 +324,13 @@ class TTS_OOD:
         hp['emotion_encoder_path'] = 'checkpoints/Emotion_encoder.pt'
         self.hp = hp

+    @prompts(name="Style Transfer",
+             description="useful for when you want to generate speech samples with styles "
+                         "(e.g., timbre, emotion, and prosody) derived from a reference custom voice. "
+                         "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
+                         "The input to this tool should be a comma seperated string of two, "
+                         "representing reference audio path and input text. " )
+
     def inference(self, inputs):
         self.set_model_hparams()
         key = ['ref_audio', 'text']
@@ -349,12 +352,6 @@ class Inpaint:
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
         self.cmap_transform = matplotlib.cm.viridis

-    @prompts(name="Audio Inpainting",
-             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, "
-                         "this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input. "
-                         "The input to this tool should be a string, "
-                         "representing the audio_path. " )
-
     def make_batch_sd(self, mel, mask, num_samples=1):

         mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
@@ -471,6 +468,13 @@ class Inpaint:
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
         soundfile.write(audio_filename, gen_wav, samplerate = 16000)
         return image_filename, audio_filename
+
+    @prompts(name="Audio Inpainting",
+             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, "
+                         "this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. " )
+
     def inference(self, input_audio_path):
         crop_len = 500 # the full mel cannot be showed due to gradio's Image bug when using tool='sketch'
         crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
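
For context, the @prompts decorator that this diff relocates is not defined in the hunks shown here. In tool-wrapper code of this kind it is typically a tiny helper that attaches a name and a description attribute to the decorated method so that an agent framework can register that method as a callable tool. The sketch below is an assumption about that pattern, not code from this commit; the prompts helper and the load_tools function are illustrative names only.

    # Hypothetical sketch: a minimal `prompts` decorator of the kind this diff moves around.
    # It only tags the method with metadata; the method's behaviour is unchanged.
    def prompts(name, description):
        def decorator(func):
            func.name = name                 # tool name shown to the agent
            func.description = description   # tells the agent when and how to call the tool
            return func
        return decorator

    # Hypothetical loader: collect every decorated method on a model instance as a tool.
    # Only methods carrying both attributes are exposed, which is why moving the decorator
    # from helpers such as txt2audio/img2audio onto inference changes which method the
    # agent is allowed to call.
    def load_tools(instance):
        tools = []
        for attr_name in dir(instance):
            attr = getattr(instance, attr_name)
            if callable(attr) and hasattr(attr, "name") and hasattr(attr, "description"):
                tools.append((attr.name, attr.description, attr))
        return tools

Under that reading, the net effect of the commit is that the exposed tool entry point for T2A, I2A, T2S, TTS_OOD and Inpaint becomes each class's inference method rather than its lower-level helper, while the prompt descriptions themselves are unchanged.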