Add ModelScope and SpeechBrain separation methods to separe_vocals.py
Browse files- separe_vocals.py +75 -21
separe_vocals.py
CHANGED
@@ -5,8 +5,16 @@ import numpy as np
|
|
5 |
import os
|
6 |
import torch
|
7 |
import argparse
|
|
|
|
|
|
|
|
|
8 |
|
9 |
SAMPLE_RATE = 8000
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def get_sample_rate(audio_file_path):
|
12 |
"""
|
@@ -51,27 +59,42 @@ def set_mono(input_audio_file_path, output_audio_file_path):
|
|
51 |
"""
|
52 |
os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
|
53 |
|
54 |
-
def
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
|
|
59 |
# Get input and output names
|
60 |
-
input_name =
|
61 |
-
|
62 |
-
|
63 |
-
# Get folder of output file
|
64 |
-
input_folder = input_name.split("/")[0]
|
65 |
-
output_folder = "vocals"
|
66 |
-
input_file_name = input_name.split("/")[1]
|
67 |
-
output_file_name = output_name.split("/")[1]
|
68 |
|
69 |
# Set input files with 8k sample rate and mono
|
70 |
-
input_8k = f"{input_name}_8k.wav"
|
71 |
-
input_8k_mono = f"{input_name}_8k_mono.wav"
|
72 |
|
73 |
# Check if input has 8k sample rate, if not, change it
|
74 |
-
sr = get_sample_rate(
|
75 |
if sr != SAMPLE_RATE:
|
76 |
change_sample_rate(input, input_8k, SAMPLE_RATE)
|
77 |
remove_8k = True
|
@@ -88,14 +111,17 @@ def main(args):
|
|
88 |
remove_mono = False
|
89 |
|
90 |
# Separate audio voices
|
91 |
-
|
92 |
-
separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
|
93 |
-
result = separation(input_8k_mono)
|
94 |
|
95 |
# Save separated audio voices
|
|
|
96 |
for i, signal in enumerate(result['output_pcm_list']):
|
97 |
-
save_file = f'{output_folder}/{
|
98 |
sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
|
|
|
|
|
|
|
|
|
99 |
|
100 |
# Remove temporary files
|
101 |
if remove_8k:
|
@@ -105,6 +131,34 @@ def main(args):
|
|
105 |
|
106 |
if __name__ == '__main__':
|
107 |
argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
|
108 |
-
argparser.add_argument('
|
|
|
109 |
args = argparser.parse_args()
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import os
|
6 |
import torch
|
7 |
import argparse
|
8 |
+
import speechbrain as sb
|
9 |
+
from speechbrain.dataio.dataio import read_audio
|
10 |
+
from speechbrain.pretrained import SepformerSeparation as separator
|
11 |
+
import torchaudio
|
12 |
|
13 |
# Sample rate passed to the audio writers when saving the separated voices
SAMPLE_RATE = 8000

# Identifiers of the available separation backends
MODEL_SPEECHBRAIN, MODEL_MODELSCOPE = "SPEECHBRAIN", "MODELSCOPE"

# Backend used by the script; swap the two assignments below to change it
# MODEL = MODEL_SPEECHBRAIN
MODEL = MODEL_MODELSCOPE
|
18 |
|
19 |
def get_sample_rate(audio_file_path):
|
20 |
"""
|
|
|
59 |
"""
|
60 |
os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 -loglevel error {output_audio_file_path}')
|
61 |
|
62 |
+
def write_number_speakers_txt(output_folder, num_speakers):
    """
    Write the number of speakers in a txt file

    The number is stored as plain text in ``<output_folder>/speakers.txt``,
    replacing the content of a previous file with that name. The output
    folder is created when it does not exist yet.

    Args:
        output_folder (str): Path to the output folder
        num_speakers (int): Number of speakers
    """
    # Create the folder first so open() cannot fail on a missing directory.
    # An empty folder name is left untouched (os.makedirs('') would raise).
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    with open(f"{output_folder}/speakers.txt", 'w') as f:
        f.write(str(num_speakers))
|
71 |
+
|
72 |
+
def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a SpeechBrain model and save
    every estimated source in its own wav file

    Each source is saved as ``<output_folder>/<input name>_speaker<NNN>.wav``
    with SAMPLE_RATE, and the number of sources is written through
    write_number_speakers_txt.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Path to the folder where the results are saved
        model: Separation model; its ``separate_file(path=...)`` method is
            called and the result is indexed as ``est_sources[:, :, i]``
    """
    # File name without folder and extension. Splitting on "." and "/" by
    # hand only worked for paths shaped exactly like "folder/name.ext" and
    # raised ValueError for nested folders, bare file names or dotted names.
    file, _ = os.path.splitext(os.path.basename(input_audio_file_path))

    est_sources = model.separate_file(path=input_audio_file_path)
    num_vocals = est_sources.shape[2]  # one estimated source per index of the last axis
    for i in range(num_vocals):
        save_file = f'{output_folder}/{file}_speaker{i:03d}.wav'
        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)

    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, num_vocals)
|
86 |
|
87 |
+
def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
|
88 |
# Get input and output names
|
89 |
+
input_name, _ = input_audio_file_path.split(".")
|
90 |
+
input_folder, input_name = input_name.split("/")
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
# Set input files with 8k sample rate and mono
|
93 |
+
input_8k = f"{input_folder}/{input_name}_8k.wav"
|
94 |
+
input_8k_mono = f"{input_folder}/{input_name}_8k_mono.wav"
|
95 |
|
96 |
# Check if input has 8k sample rate, if not, change it
|
97 |
+
sr = get_sample_rate(input_audio_file_path)
|
98 |
if sr != SAMPLE_RATE:
|
99 |
change_sample_rate(input, input_8k, SAMPLE_RATE)
|
100 |
remove_8k = True
|
|
|
111 |
remove_mono = False
|
112 |
|
113 |
# Separate audio voices
|
114 |
+
result = model(input_8k_mono)
|
|
|
|
|
115 |
|
116 |
# Save separated audio voices
|
117 |
+
speakers = 0
|
118 |
for i, signal in enumerate(result['output_pcm_list']):
|
119 |
+
save_file = f'{output_folder}/{input_name}_speaker{i:003d}.wav'
|
120 |
sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
|
121 |
+
speakers += 1
|
122 |
+
|
123 |
+
# Write number of speakers in a txt file
|
124 |
+
write_number_speakers_txt(output_folder, speakers)
|
125 |
|
126 |
# Remove temporary files
|
127 |
if remove_8k:
|
|
|
131 |
|
132 |
if __name__ == '__main__':
    # Command line: a text file with one input audio path per line, and the device
    argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    for arg_name, arg_help in (
        ('inputs_file', 'File with the list of inputs'),
        ('device', 'Device to use for separation'),
    ):
        argparser.add_argument(arg_name, type=str, help=arg_help)
    args = argparser.parse_args()
    device = args.device

    # Reject an unknown backend before loading anything
    if MODEL not in (MODEL_SPEECHBRAIN, MODEL_MODELSCOPE):
        raise ValueError(f"Model {MODEL} is not valid")

    # Load the selected separation model once, before processing the inputs
    if MODEL == MODEL_SPEECHBRAIN:
        # 'cpu' loads the model without run options; any 'cuda...' string is
        # forwarded unchanged; 'gpu' is mapped to "cuda"
        if device == 'cpu':
            extra_opts = {}
        elif 'cuda' in device:
            extra_opts = {"run_opts": {"device": f"{device}"}}
        elif device == 'gpu':
            extra_opts = {"run_opts": {"device": "cuda"}}
        else:
            raise ValueError(f"Device {device} is not valid")
        model = separator.from_hparams(
            source="speechbrain/sepformer-whamr",
            savedir='pretrained_models/sepformer-whamr',
            **extra_opts,
        )
    else:
        separation = pipeline(
            Tasks.speech_separation,
            model='damo/speech_mossformer_separation_temporal_8k',
            device=device,
        )

    # Read files from input file
    with open(args.inputs_file, 'r') as f:
        inputs = f.read().splitlines()

    output_folder = "vocals"
    # NOTE: the loop variable must stay named `input`: separate_vocals_modelscope
    # passes the module-level name `input` to change_sample_rate.
    for input in inputs:
        if MODEL == MODEL_SPEECHBRAIN:
            separate_vocals_speechbrain(input, output_folder, model)
        else:
            separate_vocals_modelscope(input, output_folder, separation)
|