lmzjms committed on
Commit cfba0dd
1 Parent(s): e6826f1

Update audio_foundation_models.py

Files changed (1)
  1. audio_foundation_models.py +2 -109
audio_foundation_models.py CHANGED
@@ -94,17 +94,7 @@ def select_best_audio(prompt,wav_list):
     print(score_list,max_index)
     return wav_list[max_index]
 
-def merge_audio(audio_path_1, audio_path_2):
-    merged_signal = []
-    sr_1, signal_1 = wavfile.read(audio_path_1)
-    sr_2, signal_2 = wavfile.read(audio_path_2)
-    merged_signal.append(signal_1)
-    merged_signal.append(signal_2)
-    merged_signal = np.hstack(merged_signal)
-    merged_signal = np.asarray(merged_signal, dtype=np.int16)
-    audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-    wavfile.write(audio_filename, sr_1, merged_signal)
-    return audio_filename
+
 class T2I:
     def __init__(self, device):
         print("Initializing T2I to %s" % device)
@@ -525,10 +515,6 @@ class ASR:
         options = whisper.DecodingOptions()
         result = whisper.decode(self.model, mel, options)
         return result.text
-
-    def translate_english(self, audio_path):
-        audio = self.model.transcribe(audio_path, language='English')
-        return audio['text']
 
 class A2T:
     def __init__(self, device):
@@ -818,97 +804,4 @@ class TargetSoundDetection:
         ans = ''
         for i,item in enumerate(time_predictions):
             ans = ans + 'segment' + str(i+1) + ' start_time: ' + str(item['onset']) + ' end_time: ' + str(item['offset']) + '\t'
-        return ans
-
-class Speech_Enh_SC:
-    """Speech Enhancement or Separation in single-channel
-    Example usage:
-        enh_model = Speech_Enh_SS("cuda")
-        enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
-    """
-    def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw"):
-        self.model_name = model_name
-        self.device = device
-        print("Initializing ESPnet Enh to %s" % device)
-        self._initialize_model()
-
-    def _initialize_model(self):
-        from espnet_model_zoo.downloader import ModelDownloader
-        from espnet2.bin.enh_inference import SeparateSpeech
-
-        d = ModelDownloader()
-
-        cfg = d.download_and_unpack(self.model_name)
-        self.separate_speech = SeparateSpeech(
-            train_config=cfg["train_config"],
-            model_file=cfg["model_file"],
-            # for segment-wise process on long speech
-            segment_size=2.4,
-            hop_size=0.8,
-            normalize_segment_scale=False,
-            show_progressbar=True,
-            ref_channel=None,
-            normalize_output_wav=True,
-            device=self.device,
-        )
-
-    @prompts(name="Speech Enhancement In Single-Channel",
-             description="useful for when you want to enhance the quality of the speech signal by reducing background noise (single-channel), "
-                         "receives audio_path as input."
-                         "The input to this tool should be a string, "
-                         "representing the audio_path. " )
-
-    def inference(self, speech_path, ref_channel=0):
-        speech, sr = soundfile.read(speech_path)
-        speech = speech[:, ref_channel]
-        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
-        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-        soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
-        return audio_filename
-
-class Speech_SS:
-    def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
-        self.model_name = model_name
-        self.device = device
-        print("Initializing ESPnet SS to %s" % device)
-        self._initialize_model()
-
-    def _initialize_model(self):
-        from espnet_model_zoo.downloader import ModelDownloader
-        from espnet2.bin.enh_inference import SeparateSpeech
-
-        d = ModelDownloader()
-
-        cfg = d.download_and_unpack(self.model_name)
-        self.separate_speech = SeparateSpeech(
-            train_config=cfg["train_config"],
-            model_file=cfg["model_file"],
-            # for segment-wise process on long speech
-            segment_size=2.4,
-            hop_size=0.8,
-            normalize_segment_scale=False,
-            show_progressbar=True,
-            ref_channel=None,
-            normalize_output_wav=True,
-            device=self.device,
-        )
-
-    @prompts(name="Speech Separation",
-             description="useful for when you want to separate each speech from the speech mixture, "
-                         "receives audio_path as input."
-                         "The input to this tool should be a string, "
-                         "representing the audio_path. " )
-
-    def inference(self, speech_path):
-        speech, sr = soundfile.read(speech_path)
-        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
-        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-        if len(enh_speech) == 1:
-            soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
-        else:
-            audio_filename_1 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-            soundfile.write(audio_filename_1, enh_speech[0].squeeze(), samplerate=sr)
-            audio_filename_2 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-            soundfile.write(audio_filename_2, enh_speech[1].squeeze(), samplerate=sr)
-            audio_filename = merge_audio(audio_filename_1, audio_filename_2)
-        return audio_filename
+        return ans
 