Maximofn committed on
Commit
1215bc9
1 Parent(s): 6d86b70

Add ModelScope and SpeechBrain separation methods to separe_vocals.py

Browse files
Files changed (1) hide show
  1. separe_vocals.py +75 -21
separe_vocals.py CHANGED
@@ -5,8 +5,16 @@ import numpy as np
5
  import os
6
  import torch
7
  import argparse
 
 
 
 
8
 
9
  SAMPLE_RATE = 8000
 
 
 
 
10
 
11
  def get_sample_rate(audio_file_path):
12
  """
@@ -51,27 +59,42 @@ def set_mono(input_audio_file_path, output_audio_file_path):
51
  """
52
  os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
53
 
54
- def main(args):
55
- # Get input and output files
56
- input = args.input
57
- output = args.input
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
59
  # Get input and output names
60
- input_name = input.split(".")[0]
61
- output_name = output.split(".")[0]
62
-
63
- # Get folder of output file
64
- input_folder = input_name.split("/")[0]
65
- output_folder = "vocals"
66
- input_file_name = input_name.split("/")[1]
67
- output_file_name = output_name.split("/")[1]
68
 
69
  # Set input files with 8k sample rate and mono
70
- input_8k = f"{input_name}_8k.wav"
71
- input_8k_mono = f"{input_name}_8k_mono.wav"
72
 
73
  # Check if input has 8k sample rate, if not, change it
74
- sr = get_sample_rate(input)
75
  if sr != SAMPLE_RATE:
76
  change_sample_rate(input, input_8k, SAMPLE_RATE)
77
  remove_8k = True
@@ -88,14 +111,17 @@ def main(args):
88
  remove_mono = False
89
 
90
  # Separate audio voices
91
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
92
- separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
93
- result = separation(input_8k_mono)
94
 
95
  # Save separated audio voices
 
96
  for i, signal in enumerate(result['output_pcm_list']):
97
- save_file = f'{output_folder}/{output_file_name}_speaker{i:003d}.wav'
98
  sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
 
 
 
 
99
 
100
  # Remove temporary files
101
  if remove_8k:
@@ -105,6 +131,34 @@ def main(args):
105
 
106
  if __name__ == '__main__':
107
  argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
108
- argparser.add_argument('input', type=str, help='Input audio file')
 
109
  args = argparser.parse_args()
110
- main(args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import os
6
  import torch
7
  import argparse
8
+ import speechbrain as sb
9
+ from speechbrain.dataio.dataio import read_audio
10
+ from speechbrain.pretrained import SepformerSeparation as separator
11
+ import torchaudio
12
 
13
  SAMPLE_RATE = 8000
14
+ MODEL_SPEECHBRAIN = "SPEECHBRAIN"
15
+ MODEL_MODELSCOPE = "MODELSCOPE"
16
+ # MODEL = MODEL_SPEECHBRAIN
17
+ MODEL = MODEL_MODELSCOPE
18
 
19
  def get_sample_rate(audio_file_path):
20
  """
 
59
  """
60
  os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
61
 
62
def write_number_speakers_txt(output_folder, num_speakers):
    """
    Write the number of speakers to ``speakers.txt`` inside the output folder.

    The file is overwritten on every call and contains only the decimal
    representation of ``num_speakers`` (no trailing newline).

    Args:
        output_folder (str): Path to the output folder. It is created when it
            does not exist yet.
        num_speakers (int): Number of speakers
    """
    # Create the folder up front so a missing directory does not abort the write
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    speakers_file = os.path.join(output_folder, "speakers.txt")
    with open(speakers_file, 'w', encoding='utf-8') as f:
        f.write(str(num_speakers))
71
+
72
def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a SpeechBrain separation model.

    One wav file per estimated source is written to the output folder as
    ``<input name>_speaker<NNN>.wav`` (sampled at ``SAMPLE_RATE``), and the
    number of written sources is recorded with ``write_number_speakers_txt``.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Path to the output folder
        model: Separation model exposing ``separate_file(path=...)``
    """
    # Keep only the file name, without folder or extension. basename/splitext
    # cope with nested folders and extra dots in the path, where unpacking the
    # result of str.split raises ValueError.
    file = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    est_sources = model.separate_file(path=input_audio_file_path)
    # NOTE(review): the third axis is treated as the source index — confirm
    # against the SpeechBrain documentation for separate_file
    num_vocals = est_sources.shape[2]
    for i in range(num_vocals):
        save_file = f'{output_folder}/{file}_speaker{i:003d}.wav'
        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)

    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, num_vocals)
86
 
87
+ def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
88
  # Get input and output names
89
+ input_name, _ = input_audio_file_path.split(".")
90
+ input_folder, input_name = input_name.split("/")
 
 
 
 
 
 
91
 
92
  # Set input files with 8k sample rate and mono
93
+ input_8k = f"{input_folder}/{input_name}_8k.wav"
94
+ input_8k_mono = f"{input_folder}/{input_name}_8k_mono.wav"
95
 
96
  # Check if input has 8k sample rate, if not, change it
97
+ sr = get_sample_rate(input_audio_file_path)
98
  if sr != SAMPLE_RATE:
99
  change_sample_rate(input, input_8k, SAMPLE_RATE)
100
  remove_8k = True
 
111
  remove_mono = False
112
 
113
  # Separate audio voices
114
+ result = model(input_8k_mono)
 
 
115
 
116
  # Save separated audio voices
117
+ speakers = 0
118
  for i, signal in enumerate(result['output_pcm_list']):
119
+ save_file = f'{output_folder}/{input_name}_speaker{i:003d}.wav'
120
  sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
121
+ speakers += 1
122
+
123
+ # Write number of speakers in a txt file
124
+ write_number_speakers_txt(output_folder, speakers)
125
 
126
  # Remove temporary files
127
  if remove_8k:
 
131
 
132
if __name__ == '__main__':
    # Command line: a text file with one input audio path per line, and the
    # device on which the separation model runs
    argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    argparser.add_argument('inputs_file', type=str, help='File with the list of inputs')
    argparser.add_argument('device', type=str, help='Device to use for separation')
    args = argparser.parse_args()

    device = args.device
    if MODEL == MODEL_SPEECHBRAIN:
        # Translate the requested device into SpeechBrain run options.
        # On cpu no run_opts are passed at all.
        if device == 'cpu':
            hparams_kwargs = {}
        elif 'cuda' in device:
            hparams_kwargs = {"run_opts": {"device": f"{device}"}}
        elif device == 'gpu':
            hparams_kwargs = {"run_opts": {"device": "cuda"}}
        else:
            raise ValueError(f"Device {device} is not valid")
        model = separator.from_hparams(
            source="speechbrain/sepformer-whamr",
            savedir='pretrained_models/sepformer-whamr',
            **hparams_kwargs,
        )
    elif MODEL == MODEL_MODELSCOPE:
        separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
    else:
        raise ValueError(f"Model {MODEL} is not valid")

    # Read files from input file; blank lines are not audio paths, so skip them
    with open(args.inputs_file, 'r') as f:
        inputs = [line for line in f.read().splitlines() if line.strip()]

    output_folder = "vocals"
    # The separated files are written into this folder, so make sure it exists
    os.makedirs(output_folder, exist_ok=True)

    # NOTE(review): the loop variable keeps the name `input` on purpose.
    # separate_vocals_modelscope passes a bare `input` to change_sample_rate,
    # which appears to resolve to this module-level name — confirm before renaming.
    # MODEL was validated above, so only the two known back ends can reach here.
    for input in inputs:
        if MODEL == MODEL_SPEECHBRAIN:
            separate_vocals_speechbrain(input, output_folder, model)
        else:
            separate_vocals_modelscope(input, output_folder, separation)