File size: 5,979 Bytes
6e9a4ca
 
 
 
 
 
 
1215bc9
 
 
 
6e9a4ca
 
1215bc9
 
 
 
6e9a4ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f76ded
6e9a4ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f76ded
6e9a4ca
1215bc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e9a4ca
1215bc9
6e9a4ca
1215bc9
 
9af6e91
6e9a4ca
1215bc9
 
6e9a4ca
 
1215bc9
6e9a4ca
 
4a317df
6e9a4ca
 
4a317df
6e9a4ca
 
 
 
4a317df
6e9a4ca
 
4a317df
6e9a4ca
 
1215bc9
6e9a4ca
 
1215bc9
6e9a4ca
1215bc9
6e9a4ca
1215bc9
 
 
 
9af6e91
 
4a317df
 
 
 
6e9a4ca
 
 
1215bc9
 
6e9a4ca
1215bc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import argparse
import os
import subprocess

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import soundfile as sf
import numpy as np
import torch
import speechbrain as sb
from speechbrain.dataio.dataio import read_audio
from speechbrain.pretrained import SepformerSeparation as separator
import torchaudio

# Sample rate (Hz) used when writing separated audio and as the target rate
# that inputs are resampled to before ModelScope separation.
SAMPLE_RATE = 8000
# Identifiers of the two supported separation back ends.
MODEL_SPEECHBRAIN = "SPEECHBRAIN"
MODEL_MODELSCOPE = "MODELSCOPE"
# Back end used by the script; swap the two assignments below to switch.
# MODEL = MODEL_SPEECHBRAIN
MODEL = MODEL_MODELSCOPE

def get_sample_rate(audio_file_path):
    """
    Get the sample rate of an audio file
    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        int: Sample rate of the audio file
    """
    # Only the file header is needed here; sf.info avoids decoding every
    # sample into memory just to learn the sample rate.
    return sf.info(audio_file_path).samplerate

def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
    """
    Change the sample rate of an audio file by invoking ffmpeg
    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file
        sample_rate (int): Sample rate to change to

    Raises:
        FileNotFoundError: If the ffmpeg executable cannot be found
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and prevents them from being interpreted as
    # shell syntax.
    command = [
        'ffmpeg',
        '-i', input_audio_file_path,
        '-ar', str(sample_rate),
        '-loglevel', 'error',
        output_audio_file_path,
    ]
    # check=True surfaces a failed conversion immediately instead of letting
    # a later step fail on a missing output file.
    subprocess.run(command, check=True)

def audio_is_stereo(audio_file_path):
    """
    Check if an audio file is stereo
    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        bool: True if the audio file has exactly two channels, False otherwise
    """
    # The channel count is stored in the file header, so there is no need to
    # decode the whole signal to inspect its shape.
    return sf.info(audio_file_path).channels == 2

def set_mono(input_audio_file_path, output_audio_file_path):
    """
    Set an audio file to mono by invoking ffmpeg
    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file

    Raises:
        FileNotFoundError: If the ffmpeg executable cannot be found
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and prevents them from being interpreted as
    # shell syntax.
    command = [
        'ffmpeg',
        '-i', input_audio_file_path,
        '-ac', '1',
        '-loglevel', 'error',
        output_audio_file_path,
    ]
    # check=True surfaces a failed conversion immediately instead of letting
    # a later step fail on a missing output file.
    subprocess.run(command, check=True)

def write_number_speakers_txt(output_folder, num_speakers):
    """
    Store the speaker count in "speakers.txt" inside the output folder,
    replacing any previous content of that file
    Args:
        output_folder (str): Folder in which speakers.txt is written
        num_speakers (int): Speaker count to record
    """
    speakers_file_path = f"{output_folder}/speakers.txt"
    with open(speakers_file_path, mode='w') as speakers_file:
        # The file holds just the number, with no trailing newline
        speakers_file.write(str(num_speakers))

def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a SpeechBrain separation model
    and save every estimated source as its own wav file
    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Folder where the separated wav files and
            speakers.txt are written (created if it does not exist)
        model: Object exposing separate_file(path=...); its result is indexed
            as est_sources[:, :, i], one slice per separated source
            (presumably a SpeechBrain SepformerSeparation -- confirm with caller)
    """
    # basename + splitext handle nested folders, bare file names and file
    # names containing extra dots.
    file_stem = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    os.makedirs(output_folder, exist_ok=True)

    est_sources = model.separate_file(path=input_audio_file_path)
    # The last axis of the model output enumerates the separated sources
    num_vocals = est_sources.shape[2]
    for i in range(num_vocals):
        save_file = f'{output_folder}/{file_stem}_speaker{i:03d}.wav'
        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)

    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, num_vocals)

def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a ModelScope separation pipeline
    and save every separated voice as its own wav file

    The input is first converted (through temporary files placed next to it)
    to SAMPLE_RATE and to mono when needed; the temporary files are removed
    afterwards, even if a step fails.
    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Folder where the separated wav files and
            speakers.txt are written (created if it does not exist)
        model: Callable taking an audio file path and returning a mapping
            whose 'output_pcm_list' entry holds one raw buffer per voice;
            each buffer is interpreted as int16 samples
    """
    # Get input folder and name; basename + splitext handle nested folders,
    # bare file names and file names containing extra dots.
    input_folder = os.path.dirname(input_audio_file_path)
    input_name = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    # Temporary files with 8k sample rate and mono
    input_8k = os.path.join(input_folder, f"{input_name}_8k.wav")
    input_8k_mono = os.path.join(input_folder, f"{input_name}_8k_mono.wav")

    temporary_files = []
    try:
        # Check if input has 8k sample rate, if not, change it
        if get_sample_rate(input_audio_file_path) != SAMPLE_RATE:
            # Registered before the conversion so that a partially written
            # file is cleaned up if the conversion fails.
            temporary_files.append(input_8k)
            change_sample_rate(input_audio_file_path, input_8k, SAMPLE_RATE)
        else:
            input_8k = input_audio_file_path

        # Check if input is stereo, if yes, set it to mono
        if audio_is_stereo(input_8k):
            temporary_files.append(input_8k_mono)
            set_mono(input_8k, input_8k_mono)
        else:
            input_8k_mono = input_8k

        # Separate audio voices
        result = model(input_8k_mono)

        # Save separated audio voices
        os.makedirs(output_folder, exist_ok=True)
        speakers = 0
        for i, signal in enumerate(result['output_pcm_list']):
            save_file = f'{output_folder}/{input_name}_speaker{i:03d}.wav'
            sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
            speakers += 1

        # Write number of speakers in a txt file
        write_number_speakers_txt(output_folder, speakers)
    finally:
        # Remove temporary files; the existence check keeps a failed
        # conversion from masking the original error with a second one.
        for temporary_file in temporary_files:
            if os.path.exists(temporary_file):
                os.remove(temporary_file)

# Script entry point: load the back end selected by MODEL on the requested
# device, then separate every audio file listed (one path per line) in the
# inputs file, writing the results to the "vocals" folder.
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    argparser.add_argument('inputs_file', type=str, help='File with the list of inputs')
    argparser.add_argument('device', type=str, help='Device to use for separation')
    args = argparser.parse_args()

    device = args.device
    if MODEL == MODEL_SPEECHBRAIN:
        # Only the run options differ between devices, so resolve them first
        # and load the model in a single place.
        load_kwargs = {}
        if device == 'cpu':
            pass  # default run options are used on cpu
        elif 'cuda' in device:
            load_kwargs['run_opts'] = {"device": device}
        elif device == 'gpu':
            load_kwargs['run_opts'] = {"device": "cuda"}
        else:
            raise ValueError(f"Device {device} is not valid")
        model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr', **load_kwargs)
        separate_vocals = separate_vocals_speechbrain
    elif MODEL == MODEL_MODELSCOPE:
        model = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
        separate_vocals = separate_vocals_modelscope
    else:
        raise ValueError(f"Model {MODEL} is not valid")

    # Read files from input file, ignoring blank lines and surrounding
    # whitespace so that stray empty lines are not treated as paths
    with open(args.inputs_file, 'r') as f:
        inputs = [line.strip() for line in f if line.strip()]

    output_folder = "vocals"
    # NOTE(review): the loop variable keeps the name `input` (shadowing the
    # builtin) because helper functions in this file may read it as a
    # module-level global; rename it only after confirming that none do.
    for input in inputs:
        separate_vocals(input, output_folder, model)