Maximofn committed on
Commit 6e9a4ca
1 Parent(s): 5df18dc

Create script to separate voices in an input audio file

Files changed (1)
  1. separe.py +101 -0
separe.py ADDED
@@ -0,0 +1,101 @@
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+ import soundfile as sf
+ import numpy as np
+ import os
+ import torch
+ import argparse
+
+ SAMPLE_RATE = 8000
+
+ def get_sample_rate(audio_file_path):
+     """
+     Get the sample rate of an audio file
+     Args:
+         audio_file_path (str): Path to the audio file
+
+     Returns:
+         int: Sample rate of the audio file
+     """
+     _, sample_rate = sf.read(audio_file_path, always_2d=True)
+     return sample_rate
+
+ def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
+     """
+     Change the sample rate of an audio file
+     Args:
+         input_audio_file_path (str): Path to the input audio file
+         output_audio_file_path (str): Path to the output audio file
+         sample_rate (int): Sample rate to change to
+     """
+     os.system(f'ffmpeg -i {input_audio_file_path} -ar {sample_rate} {output_audio_file_path}')
+
+ def audio_is_stereo(audio_file_path):
+     """
+     Check if an audio file is stereo
+     Args:
+         audio_file_path (str): Path to the audio file
+
+     Returns:
+         bool: True if the audio file is stereo, False otherwise
+     """
+     audio, _ = sf.read(audio_file_path, always_2d=True)
+     return audio.shape[1] == 2
+
+ def set_mono(input_audio_file_path, output_audio_file_path):
+     """
+     Set an audio file to mono
+     Args:
+         input_audio_file_path (str): Path to the input audio file
+         output_audio_file_path (str): Path to the output audio file
+     """
+     os.system(f'ffmpeg -i {input_audio_file_path} -ac 1 {output_audio_file_path}')
+
+ def main(args):
+     # Get input and output files
+     input = args.input
+     output = args.output
+
+     # Get input and output names
+     input_name = input.split(".")[0]
+     output_name = output.split(".")[0]
+
+     # Set input files with 8k sample rate and mono
+     input_8k = f"{input_name}_8k.wav"
+     input_8k_mono = f"{input_name}_8k_mono.wav"
+
+     # Check if input has 8k sample rate, if not, change it
+     sr = get_sample_rate(input)
+     if sr != SAMPLE_RATE:
+         print("Changing sample rate...")
+         change_sample_rate(input, input_8k, SAMPLE_RATE)
+     else:
+         input_8k = input
+
+     # Check if input is stereo, if yes, set it to mono
+     if audio_is_stereo(input_8k):
+         print("Setting mono...")
+         set_mono(input_8k, input_8k_mono)
+     else:
+         input_8k_mono = input_8k
+
+     # Separate audio voices
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
+     print("Separating...")
+     result = separation(input_8k_mono)
+     print("Separated!")
+
+     # Save separated audio voices
+     print("Saving...")
+     for i, signal in enumerate(result['output_pcm_list']):
+         save_file = f'{output_name}_spk{i}.wav'
+         sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
+     print("Saved!")
+
+ if __name__ == '__main__':
+     argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
+     argparser.add_argument('input', type=str, help='Input audio file')
+     argparser.add_argument('output', type=str, help='Output file name, used as a prefix for the separated speaker tracks')
+     args = argparser.parse_args()
+     main(args)
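A minimal way to try the new script, assuming ffmpeg is available on the PATH and the modelscope, soundfile, numpy, and torch packages are installed (the file names below are placeholders, not part of the commit):

    python separe.py interview.wav separated.wav

With these arguments the script would, when needed, write the intermediate files interview_8k.wav and interview_8k_mono.wav, and then one file per separated speaker: separated_spk0.wav, separated_spk1.wav, and so on.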