Spaces:

Maximofn
/

subtify

Running

App Files Files Community

Maximofn commited on Oct 25, 2023

Commit

1215bc9

1 Parent(s): 6d86b70

Into separe_vocals.py set modelscope and speechbrain methods

Browse files

Files changed (1) hide show

separe_vocals.py +75 -21

separe_vocals.py CHANGED Viewed

@@ -5,8 +5,16 @@ import numpy as np
 import os
 import torch
 import argparse
 SAMPLE_RATE = 8000
 def get_sample_rate(audio_file_path):
     """
@@ -51,27 +59,42 @@ def set_mono(input_audio_file_path, output_audio_file_path):
     """
     os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
-def main(args):
-    # Get input and output files
-    input = args.input
-    output = args.input
     # Get input and output names
-    input_name = input.split(".")[0]
-    output_name = output.split(".")[0]
-    # Get folder of output file
-    input_folder = input_name.split("/")[0]
-    output_folder = "vocals"
-    input_file_name = input_name.split("/")[1]
-    output_file_name = output_name.split("/")[1]
     # Set input files with 8k sample rate and mono
-    input_8k = f"{input_name}_8k.wav"
-    input_8k_mono = f"{input_name}_8k_mono.wav"
     # Check if input has 8k sample rate, if not, change it
-    sr = get_sample_rate(input)
     if sr != SAMPLE_RATE:
         change_sample_rate(input, input_8k, SAMPLE_RATE)
         remove_8k = True
@@ -88,14 +111,17 @@ def main(args):
         remove_mono = False
     # Separate audio voices
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
-    result = separation(input_8k_mono)
     # Save separated audio voices
     for i, signal in enumerate(result['output_pcm_list']):
-        save_file = f'{output_folder}/{output_file_name}_speaker{i:003d}.wav'
         sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
     # Remove temporary files
     if remove_8k:
@@ -105,6 +131,34 @@ def main(args):
 if __name__ == '__main__':
     argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
-    argparser.add_argument('input', type=str, help='Input audio file')
     args = argparser.parse_args()
-    main(args)

 import os
 import torch
 import argparse
+import speechbrain as sb
+from speechbrain.dataio.dataio import read_audio
+from speechbrain.pretrained import SepformerSeparation as separator
+import torchaudio
 SAMPLE_RATE = 8000
+MODEL_SPEECHBRAIN = "SPEECHBRAIN"
+MODEL_MODELSCOPE = "MODELSCOPE"
+# MODEL = MODEL_SPEECHBRAIN
+MODEL = MODEL_MODELSCOPE
 def get_sample_rate(audio_file_path):
     """
     """
     os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
+def write_number_speakers_txt(output_folder, num_speakers):
+    """
+    Write the number of speakers in a txt file
+    Args:
+        output_folder (str): Path to the output folder
+        num_speakers (int): Number of speakers
+    """
+    with open(f"{output_folder}/speakers.txt", 'w') as f:
+        f.write(str(num_speakers))
+def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
+    file, _ = input_audio_file_path.split(".")
+    _, file = file.split("/")
+    est_sources = model.separate_file(path=input_audio_file_path)
+    num_vocals = est_sources.shape[2]
+    speakers = 0
+    for i in range(num_vocals):
+        save_file = f'{output_folder}/{file}_speaker{i:003d}.wav'
+        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)
+        speakers += 1
+    # Write number of speakers in a txt file
+    write_number_speakers_txt(output_folder, speakers)
+def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
     # Get input and output names
+    input_name, _ = input_audio_file_path.split(".")
+    input_folder, input_name = input_name.split("/")
     # Set input files with 8k sample rate and mono
+    input_8k = f"{input_folder}/{input_name}_8k.wav"
+    input_8k_mono = f"{input_folder}/{input_name}_8k_mono.wav"
     # Check if input has 8k sample rate, if not, change it
+    sr = get_sample_rate(input_audio_file_path)
     if sr != SAMPLE_RATE:
         change_sample_rate(input, input_8k, SAMPLE_RATE)
         remove_8k = True
         remove_mono = False
     # Separate audio voices
+    result = model(input_8k_mono)
     # Save separated audio voices
+    speakers = 0
     for i, signal in enumerate(result['output_pcm_list']):
+        save_file = f'{output_folder}/{input_name}_speaker{i:003d}.wav'
         sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
+        speakers += 1
+    # Write number of speakers in a txt file
+    write_number_speakers_txt(output_folder, speakers)
     # Remove temporary files
     if remove_8k:
 if __name__ == '__main__':
     argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
+    argparser.add_argument('inputs_file', type=str, help='File with the list of inputs')
+    argparser.add_argument('device', type=str, help='Device to use for separation')
     args = argparser.parse_args()
+    device = args.device
+    if MODEL == MODEL_SPEECHBRAIN:
+        if device == 'cpu':
+            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr')
+        elif 'cuda' in device:
+            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr', run_opts={"device":f"{device}"})
+        elif device == 'gpu':
+            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr', run_opts={"device":"cuda"})
+        else:
+            raise ValueError(f"Device {device} is not valid")
+    elif MODEL == MODEL_MODELSCOPE:
+        separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
+    else:
+        raise ValueError(f"Model {MODEL} is not valid")
+    # Read files from input file
+    with open(args.inputs_file, 'r') as f:
+        inputs = f.read().splitlines()
+    output_folder = "vocals"
+    for input in inputs:
+        if MODEL == MODEL_SPEECHBRAIN:
+            separate_vocals_speechbrain(input, output_folder, model)
+        elif MODEL == MODEL_MODELSCOPE:
+            separate_vocals_modelscope(input, output_folder, separation)
+        else:
+            raise ValueError(f"Model {MODEL} is not valid")