lmzjms committed on
Commit
e2ad87f
·
1 Parent(s): f608a35

Update audio_foundation_models.py

Browse files
Files changed (1) hide show
  1. audio_foundation_models.py +94 -1
audio_foundation_models.py CHANGED
@@ -808,4 +808,97 @@ class TargetSoundDetection:
808
  ans = ''
809
  for i,item in enumerate(time_predictions):
810
  ans = ans + 'segment' + str(i+1) + ' start_time: ' + str(item['onset']) + ' end_time: ' + str(item['offset']) + '\t'
811
- return ans
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808
  ans = ''
809
  for i,item in enumerate(time_predictions):
810
  ans = ans + 'segment' + str(i+1) + ' start_time: ' + str(item['onset']) + ' end_time: ' + str(item['offset']) + '\t'
811
+ return ans
812
+
813
class Speech_Enh_SC:
    """Speech enhancement (background-noise reduction) for single-channel audio.

    Wraps a pretrained ESPnet enhancement model downloaded from the model zoo.

    Example usage:
        enh_model = Speech_Enh_SC("cuda")
        enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
    """

    def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw"):
        # model_name is an ESPnet model-zoo identifier resolved by ModelDownloader.
        self.model_name = model_name
        self.device = device
        print("Initializing ESPnet Enh to %s" % device)
        self._initialize_model()

    def _initialize_model(self):
        """Download the pretrained model and build the SeparateSpeech inference object."""
        # Imported lazily so the heavy espnet dependency is only needed when
        # this tool is actually instantiated.
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.enh_inference import SeparateSpeech

        d = ModelDownloader()

        cfg = d.download_and_unpack(self.model_name)
        self.separate_speech = SeparateSpeech(
            train_config=cfg["train_config"],
            model_file=cfg["model_file"],
            # for segment-wise process on long speech
            segment_size=2.4,
            hop_size=0.8,
            normalize_segment_scale=False,
            show_progressbar=True,
            ref_channel=None,
            normalize_output_wav=True,
            device=self.device,
        )

    @prompts(name="Speech Enhancement In Single-Channel",
             description="useful for when you want to enhance the quality of the speech signal by reducing background noise (single-channel), "
                         "receives audio_path as input."
                         "The input to this tool should be a string, "
                         "representing the audio_path. " )
    def inference(self, speech_path, ref_channel=0):
        """Enhance the speech in ``speech_path`` and write the result to a new wav.

        Args:
            speech_path: path to the input audio file.
            ref_channel: channel index to use when the input is multi-channel.

        Returns:
            Path of the enhanced wav written under the 'audio' directory.
        """
        speech, sr = soundfile.read(speech_path)
        # Bug fix: soundfile.read returns a 1-D array for mono audio, so the
        # unconditional speech[:, ref_channel] raised IndexError on mono input.
        # Only select a channel when the signal actually has more than one.
        if speech.ndim > 1:
            speech = speech[:, ref_channel]
        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
        return audio_filename
858
+
859
class Speech_SS:
    """Speech separation: split a single-channel speech mixture into its sources.

    Wraps a pretrained ESPnet separation model downloaded from the model zoo.
    When the model yields one source, its wav path is returned directly; when it
    yields two, both are written and merged via ``merge_audio``.
    """

    def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
        # model_name is an ESPnet model-zoo identifier resolved by ModelDownloader.
        self.model_name = model_name
        self.device = device
        print("Initializing ESPnet SS to %s" % device)
        self._initialize_model()

    def _initialize_model(self):
        """Download the pretrained model and build the SeparateSpeech inference object."""
        # Imported lazily so the heavy espnet dependency is only needed when
        # this tool is actually instantiated.
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.enh_inference import SeparateSpeech

        d = ModelDownloader()

        cfg = d.download_and_unpack(self.model_name)
        self.separate_speech = SeparateSpeech(
            train_config=cfg["train_config"],
            model_file=cfg["model_file"],
            # for segment-wise process on long speech
            segment_size=2.4,
            hop_size=0.8,
            normalize_segment_scale=False,
            show_progressbar=True,
            ref_channel=None,
            normalize_output_wav=True,
            device=self.device,
        )

    @prompts(name="Speech Separation",
             description="useful for when you want to separate each speech from the speech mixture, "
                         "receives audio_path as input."
                         "The input to this tool should be a string, "
                         "representing the audio_path. " )
    def inference(self, speech_path):
        """Separate the mixture in ``speech_path`` and write the result(s) to wav.

        Args:
            speech_path: path to the input mixture audio file.

        Returns:
            Path of the output wav under the 'audio' directory (a merged file
            when the model produces two separated sources).
        """
        speech, sr = soundfile.read(speech_path)
        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
        # Cleanup: previously a uuid filename was generated unconditionally and
        # then discarded unused in the multi-source branch; generate names only
        # where they are actually written.
        if len(enh_speech) == 1:
            audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
            soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
        else:
            audio_filename_1 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
            soundfile.write(audio_filename_1, enh_speech[0].squeeze(), samplerate=sr)
            audio_filename_2 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
            soundfile.write(audio_filename_2, enh_speech[1].squeeze(), samplerate=sr)
            audio_filename = merge_audio(audio_filename_1, audio_filename_2)
        return audio_filename