Create script to separate voices in an input audio file
Browse files
separe.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from modelscope.pipelines import pipeline
|
2 |
+
from modelscope.utils.constant import Tasks
|
3 |
+
import soundfile as sf
|
4 |
+
import numpy as np
|
5 |
+
import os
|
6 |
+
import torch
|
7 |
+
import argparse
|
8 |
+
|
9 |
+
# Sample rate (Hz) the input audio is converted to before separation, and the
# rate the separated speaker files are written with.
SAMPLE_RATE = 8000
|
10 |
+
|
11 |
+
def get_sample_rate(audio_file_path):
    """
    Get the sample rate of an audio file

    The rate is taken from the file metadata reported by ``soundfile.info``,
    so the audio samples themselves are not loaded into memory.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        int: Sample rate of the audio file
    """
    return sf.info(audio_file_path).samplerate
|
22 |
+
|
23 |
+
def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
    """
    Change the sample rate of an audio file

    Runs ``ffmpeg -i <input> -ar <sample_rate> <output>``. The command is
    passed as an argument list (no shell), so paths containing spaces or
    shell metacharacters are handled literally.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file
        sample_rate (int): Sample rate to change to

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found
    """
    # Imported locally so this function does not depend on the module's import block.
    import subprocess

    command = [
        'ffmpeg',
        '-i', str(input_audio_file_path),
        '-ar', str(sample_rate),
        str(output_audio_file_path),
    ]
    # check=False: as before, a non-zero ffmpeg exit status is not treated as an error here.
    subprocess.run(command, check=False)
|
32 |
+
|
33 |
+
def audio_is_stereo(audio_file_path):
    """
    Check if an audio file is stereo

    The channel count is taken from the file metadata reported by
    ``soundfile.info``, so the audio samples are not loaded into memory.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        bool: True if the audio file has exactly two channels, False otherwise
    """
    return sf.info(audio_file_path).channels == 2
|
44 |
+
|
45 |
+
def set_mono(input_audio_file_path, output_audio_file_path):
    """
    Set an audio file to mono

    Runs ``ffmpeg -i <input> -ac 1 <output>``. The command is passed as an
    argument list (no shell), so paths containing spaces or shell
    metacharacters are handled literally.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found
    """
    # Imported locally so this function does not depend on the module's import block.
    import subprocess

    command = [
        'ffmpeg',
        '-i', str(input_audio_file_path),
        '-ac', '1',
        str(output_audio_file_path),
    ]
    # check=False: as before, a non-zero ffmpeg exit status is not treated as an error here.
    subprocess.run(command, check=False)
|
53 |
+
|
54 |
+
def main(args):
    """
    Separate the voices of an audio file and save one WAV file per voice

    The input is first converted to SAMPLE_RATE and to mono when needed
    (intermediate files ``<input>_8k.wav`` and ``<input>_8k_mono.wav`` are
    written next to the input and are not removed afterwards). The result of
    the separation pipeline is then saved as ``<output>_spk<i>.wav``, where
    ``<input>`` and ``<output>`` are the given paths without their extension.

    Args:
        args (argparse.Namespace): Parsed arguments with the attributes
            ``input`` (path to the input audio file) and ``output``
            (path used as prefix for the separated files)
    """
    # Get input and output files
    input_path = args.input
    output_path = args.output

    # Get input and output names. splitext drops only the final extension, so
    # paths such as "./audio.wav" or "my.song.wav" keep their full stem
    # (splitting on the first "." would give "" and "my" respectively).
    input_name = os.path.splitext(input_path)[0]
    output_name = os.path.splitext(output_path)[0]

    # Set input files with 8k sample rate and mono
    input_8k = f"{input_name}_8k.wav"
    input_8k_mono = f"{input_name}_8k_mono.wav"

    # Check if input has 8k sample rate, if not, change it
    sr = get_sample_rate(input_path)
    if sr != SAMPLE_RATE:
        print("Changing sample rate...")
        change_sample_rate(input_path, input_8k, SAMPLE_RATE)
    else:
        input_8k = input_path

    # Check if input is stereo, if yes, set it to mono
    if audio_is_stereo(input_8k):
        print("Setting mono...")
        set_mono(input_8k, input_8k_mono)
    else:
        input_8k_mono = input_8k

    # Separate audio voices
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
    print("Separating...")
    result = separation(input_8k_mono)
    print("Separated!")

    # Save separated audio voices; each entry is a raw buffer read as int16 samples
    print("Saving...")
    for i, signal in enumerate(result['output_pcm_list']):
        save_file = f'{output_name}_spk{i}.wav'
        sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
    print("Saved!")
|
95 |
+
|
96 |
+
if __name__ == '__main__':
    # Command-line entry point: parse the two positional paths and run the separation.
    parser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    parser.add_argument('input', type=str, help='Input audio file')
    # The value is used as a file-name prefix, not as a directory.
    parser.add_argument(
        'output',
        type=str,
        help='Output file name (extension is dropped); each separated voice is saved as <name>_spk<N>.wav',
    )
    main(parser.parse_args())
|