# Text_to_Video_Demo / preprocessing / extract_vocals.py
# (Initial commit 2b67076 — 2.28 kB; header recovered from file-viewer page dump)
from pathlib import Path
import os, tempfile
import numpy as np
import soundfile as sf
import librosa
import torch
import gc
from audio_separator.separator import Separator
def get_vocals(src_path: str, dst_path: str, min_seconds: float = 8) -> str:
    """
    Extract the vocal stem from `src_path` and write it next to `dst_path`.

    If the source audio is shorter than `min_seconds`, it is padded with
    trailing silence in a temporary WAV before separation (so the model
    receives enough context), then only the vocals are emitted.

    Args:
        src_path: Path to the input audio file.
        dst_path: Desired output path; its parent dir and stem/suffix are
            used to name the separator's output file.
        min_seconds: Minimum audio length fed to the separator.

    Returns:
        Full path to the written vocals file (as reported by the separator).
    """
    # Separation is forced onto CPU; remember the current default device so
    # it can be restored no matter what happens below.
    default_device = torch.get_default_device()
    torch.set_default_device('cpu')
    use_path = src_path
    temp_path = None
    try:
        # NOTE(fix): everything after the set_default_device() mutation must
        # live inside this try — previously mkdir/get_duration ran before it,
        # and a failure there left the global default device stuck on CPU.
        dst = Path(dst_path)
        dst.parent.mkdir(parents=True, exist_ok=True)
        # Cheap length probe without decoding the whole file.
        duration = librosa.get_duration(path=src_path)
        if duration < min_seconds:
            # Load (keeping native sample rate) and pad in memory.
            y, sr = librosa.load(src_path, sr=None, mono=False)
            if y.ndim == 1:  # normalize to (channels, samples) for mono input
                y = y[np.newaxis, :]
            target_len = int(min_seconds * sr)
            pad = max(0, target_len - y.shape[1])
            if pad:
                y = np.pad(y, ((0, 0), (0, pad)), mode="constant")
            # Write a temp WAV for the separator.
            fd, temp_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)  # mkstemp hands back an open fd; soundfile reopens by path
            sf.write(temp_path, y.T, sr)  # soundfile expects (frames, channels)
            use_path = temp_path
        # Run separation: emit only the vocals, named after dst's stem.
        sep = Separator(
            output_dir=str(dst.parent),
            output_format=(dst.suffix.lstrip(".") or "wav"),
            output_single_stem="Vocals",
            model_file_dir="ckpts/roformer/" #model_bs_roformer_ep_317_sdr_12.9755.ckpt"
        )
        sep.load_model()
        out_files = sep.separate(use_path, {"Vocals": dst.stem})
        out = Path(out_files[0])
        # Separator may return a bare filename; resolve it against output_dir.
        return str(out if out.is_absolute() else (dst.parent / out))
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
        torch.cuda.empty_cache()  # no-op when CUDA is absent/uninitialized
        gc.collect()
        torch.set_default_device(default_device)
# Example:
# final = get_vocals("in/clip.mp3", "out/vocals.wav")
# print(final)