# subtify / separe_vocals.py
# Author: Maximofn
# Into separe_vocals.py set modelscope and speechbrain methods
# Commit: 1215bc9
import argparse
import os
import subprocess

import numpy as np
import soundfile as sf
import speechbrain as sb
import torch
import torchaudio
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from speechbrain.dataio.dataio import read_audio
from speechbrain.pretrained import SepformerSeparation as separator
# Sample rate (Hz) the separation models expect their input audio in.
SAMPLE_RATE = 8000
# Identifiers for the two supported separation backends.
MODEL_SPEECHBRAIN = "SPEECHBRAIN"
MODEL_MODELSCOPE = "MODELSCOPE"
# Active backend; swap the comment below to use SpeechBrain instead.
# MODEL = MODEL_SPEECHBRAIN
MODEL = MODEL_MODELSCOPE
def get_sample_rate(audio_file_path):
    """
    Get the sample rate of an audio file.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        int: Sample rate of the audio file
    """
    # sf.info reads only the file header instead of decoding the whole
    # audio payload like the previous sf.read(...) call did.
    return sf.info(audio_file_path).samplerate
def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
    """
    Change the sample rate of an audio file with ffmpeg.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file
        sample_rate (int): Target sample rate in Hz
    """
    # Pass arguments as a list (shell=False) so paths containing spaces
    # or shell metacharacters cannot break or inject the command, which
    # the previous os.system(f'...') call allowed.
    subprocess.run(
        ['ffmpeg', '-i', input_audio_file_path,
         '-ar', str(sample_rate),
         '-loglevel', 'error',
         output_audio_file_path],
        check=False,  # mirror os.system: do not raise on a non-zero exit code
    )
def audio_is_stereo(audio_file_path):
    """
    Tell whether an audio file holds exactly two channels.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        bool: True when the file is stereo, False otherwise
    """
    # always_2d guarantees the samples array has shape (frames, channels),
    # even for mono files.
    samples, _ = sf.read(audio_file_path, always_2d=True)
    num_channels = samples.shape[1]
    return num_channels == 2
def set_mono(input_audio_file_path, output_audio_file_path):
    """
    Downmix an audio file to a single channel with ffmpeg.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file
    """
    # Pass arguments as a list (shell=False) so paths containing spaces
    # or shell metacharacters cannot break or inject the command, which
    # the previous os.system(f'...') call allowed.
    subprocess.run(
        ['ffmpeg', '-i', input_audio_file_path,
         '-ac', '1',
         '-loglevel', 'error',
         output_audio_file_path],
        check=False,  # mirror os.system: do not raise on a non-zero exit code
    )
def write_number_speakers_txt(output_folder, num_speakers):
    """
    Persist the detected speaker count to <output_folder>/speakers.txt.

    Args:
        output_folder (str): Path to the output folder
        num_speakers (int): Number of speakers
    """
    speakers_path = f"{output_folder}/speakers.txt"
    with open(speakers_path, 'w') as speakers_file:
        speakers_file.write(str(num_speakers))
def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a SpeechBrain Sepformer model.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Folder where per-speaker wav files are written
        model: SpeechBrain separator exposing `separate_file(path=...)`
    """
    # os.path handles nested directories, extension-less names and file
    # names containing extra dots; the previous split('.')/split('/')
    # pair raised ValueError on all of those.
    file_name = os.path.splitext(os.path.basename(input_audio_file_path))[0]
    # est_sources is indexed as [:, :, i] below, so the third axis holds
    # one separated source per speaker.
    est_sources = model.separate_file(path=input_audio_file_path)
    num_vocals = est_sources.shape[2]
    speakers = 0
    for i in range(num_vocals):
        save_file = f'{output_folder}/{file_name}_speaker{i:003d}.wav'
        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)
        speakers += 1
    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, speakers)
def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a ModelScope pipeline.

    The model expects 8 kHz mono input, so the audio is resampled and/or
    downmixed into temporary files first, which are removed afterwards.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Folder where per-speaker wav files are written
        model: ModelScope speech-separation pipeline, called with a wav path
    """
    # os.path handles nested directories, bare filenames and names with
    # extra dots; the previous split('.')/split('/') pair raised
    # ValueError on all of those.
    input_folder = os.path.dirname(input_audio_file_path)
    input_name = os.path.splitext(os.path.basename(input_audio_file_path))[0]
    # Candidate temporary files at 8 kHz and 8 kHz mono.
    input_8k = os.path.join(input_folder, f"{input_name}_8k.wav")
    input_8k_mono = os.path.join(input_folder, f"{input_name}_8k_mono.wav")
    # Resample to 8 kHz if needed.
    # BUG FIX: the original referenced the global builtin-shadowing
    # `input` (set by the __main__ loop) instead of the parameter, so the
    # function only worked by accident when called from this script.
    sr = get_sample_rate(input_audio_file_path)
    if sr != SAMPLE_RATE:
        change_sample_rate(input_audio_file_path, input_8k, SAMPLE_RATE)
        remove_8k = True
    else:
        input_8k = input_audio_file_path
        remove_8k = False
    # Downmix to mono if needed.
    if audio_is_stereo(input_8k):
        set_mono(input_8k, input_8k_mono)
        remove_mono = True
    else:
        input_8k_mono = input_8k
        remove_mono = False
    # Separate audio voices
    result = model(input_8k_mono)
    # Save separated audio voices; each entry of output_pcm_list is
    # interpreted below as raw 16-bit PCM bytes.
    speakers = 0
    for i, signal in enumerate(result['output_pcm_list']):
        save_file = f'{output_folder}/{input_name}_speaker{i:003d}.wav'
        sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
        speakers += 1
    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, speakers)
    # Remove temporary files
    if remove_8k:
        os.remove(input_8k)
    if remove_mono:
        os.remove(input_8k_mono)
if __name__ == '__main__':
    # CLI: a text file listing input audio paths (one per line) and the
    # device string to run separation on.
    argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    argparser.add_argument('inputs_file', type=str, help='File with the list of inputs')
    argparser.add_argument('device', type=str, help='Device to use for separation')
    args = argparser.parse_args()
    device = args.device
    if MODEL == MODEL_SPEECHBRAIN:
        # Accepted devices: 'cpu', any string containing 'cuda'
        # (e.g. 'cuda:0'), or the alias 'gpu' which maps to 'cuda'.
        # NOTE(review): the 'cuda' substring check runs before the 'gpu'
        # branch, so 'gpu' never matches the 'cuda' test — order is fine,
        # but any other value raises below.
        if device == 'cpu':
            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr')
        elif 'cuda' in device:
            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr', run_opts={"device":f"{device}"})
        elif device == 'gpu':
            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr', run_opts={"device":"cuda"})
        else:
            raise ValueError(f"Device {device} is not valid")
    elif MODEL == MODEL_MODELSCOPE:
        separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
    else:
        raise ValueError(f"Model {MODEL} is not valid")
    # Read files from input file
    with open(args.inputs_file, 'r') as f:
        inputs = f.read().splitlines()
    # Output folder is assumed to exist already — TODO confirm, nothing
    # here creates it.
    output_folder = "vocals"
    # NOTE(review): the loop variable shadows the builtin `input`, and the
    # original separate_vocals_modelscope reads this global on its
    # resample path — rename both together, not separately.
    for input in inputs:
        if MODEL == MODEL_SPEECHBRAIN:
            separate_vocals_speechbrain(input, output_folder, model)
        elif MODEL == MODEL_MODELSCOPE:
            separate_vocals_modelscope(input, output_folder, separation)
        else:
            raise ValueError(f"Model {MODEL} is not valid")