vidfom
/

wan2gp

Model card Files Files and versions

wan2gp / preprocessing /extract_vocals.py

vidfom's picture

Upload folder using huggingface_hub

618f472 verified 5 months ago

history blame contribute delete

2.28 kB

	from pathlib import Path
	import os, tempfile
	import numpy as np
	import soundfile as sf
	import librosa
	import torch
	import gc

	from audio_separator.separator import Separator

	def get_vocals(src_path: str, dst_path: str, min_seconds: float = 8) -> str:
	"""
	If the source audio is shorter than `min_seconds`, pad with trailing silence
	in a temporary file, then run separation and save only the vocals to dst_path.
	Returns the full path to the vocals file.
	"""

	default_device = torch.get_default_device()
	torch.set_default_device('cpu')

	dst = Path(dst_path)
	dst.parent.mkdir(parents=True, exist_ok=True)

	# Quick duration check
	duration = librosa.get_duration(path=src_path)

	use_path = src_path
	temp_path = None
	try:
	if duration < min_seconds:
	# Load (resample) and pad in memory
	y, sr = librosa.load(src_path, sr=None, mono=False)
	if y.ndim == 1: # ensure shape (channels, samples)
	y = y[np.newaxis, :]
	target_len = int(min_seconds * sr)
	pad = max(0, target_len - y.shape[1])
	if pad:
	y = np.pad(y, ((0, 0), (0, pad)), mode="constant")

	# Write a temp WAV for the separator
	fd, temp_path = tempfile.mkstemp(suffix=".wav")
	os.close(fd)
	sf.write(temp_path, y.T, sr) # soundfile expects (frames, channels)
	use_path = temp_path

	# Run separation: emit only the vocals, with your exact filename
	sep = Separator(
	output_dir=str(dst.parent),
	output_format=(dst.suffix.lstrip(".") or "wav"),
	output_single_stem="Vocals",
	model_file_dir="ckpts/roformer/" #model_bs_roformer_ep_317_sdr_12.9755.ckpt"
	)
	sep.load_model()
	out_files = sep.separate(use_path, {"Vocals": dst.stem})

	out = Path(out_files[0])
	return str(out if out.is_absolute() else (dst.parent / out))
	finally:
	if temp_path and os.path.exists(temp_path):
	os.remove(temp_path)

	torch.cuda.empty_cache()
	gc.collect()
	torch.set_default_device(default_device)

	# Example:
	# final = get_vocals("in/clip.mp3", "out/vocals.wav")
	# print(final)