# Text_to_Video_Demo / preprocessing / extract_vocals.py
# (Initial commit 2b67076 — 2.28 kB; header recovered from file-viewer page dump)
from pathlib import Path
import os, tempfile
import numpy as np
import soundfile as sf
import librosa
import torch
import gc
from audio_separator.separator import Separator
def get_vocals(src_path: str, dst_path: str, min_seconds: float = 8) -> str:
    """
    Extract the vocal stem from `src_path` and write it next to `dst_path`.

    If the source audio is shorter than `min_seconds`, it is padded with
    trailing silence in a temporary WAV before separation (so the model
    receives enough context), then only the vocals are emitted.

    Args:
        src_path: Path to the input audio file.
        dst_path: Desired output path; its parent dir and stem/suffix are
            used to name the separator's output file.
        min_seconds: Minimum audio length fed to the separator.

    Returns:
        Full path to the written vocals file (as reported by the separator).
    """
    # Separation is forced onto CPU; remember the current default device so
    # it can be restored no matter what happens below.
    default_device = torch.get_default_device()
    torch.set_default_device('cpu')
    use_path = src_path
    temp_path = None
    try:
        # NOTE(fix): everything after the set_default_device() mutation must
        # live inside this try — previously mkdir/get_duration ran before it,
        # and a failure there left the global default device stuck on CPU.
        dst = Path(dst_path)
        dst.parent.mkdir(parents=True, exist_ok=True)
        # Cheap length probe without decoding the whole file.
        duration = librosa.get_duration(path=src_path)
        if duration < min_seconds:
            # Load (keeping native sample rate) and pad in memory.
            y, sr = librosa.load(src_path, sr=None, mono=False)
            if y.ndim == 1:  # normalize to (channels, samples) for mono input
                y = y[np.newaxis, :]
            target_len = int(min_seconds * sr)
            pad = max(0, target_len - y.shape[1])
            if pad:
                y = np.pad(y, ((0, 0), (0, pad)), mode="constant")
            # Write a temp WAV for the separator.
            fd, temp_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)  # mkstemp hands back an open fd; soundfile reopens by path
            sf.write(temp_path, y.T, sr)  # soundfile expects (frames, channels)
            use_path = temp_path
        # Run separation: emit only the vocals, named after dst's stem.
        sep = Separator(
            output_dir=str(dst.parent),
            output_format=(dst.suffix.lstrip(".") or "wav"),
            output_single_stem="Vocals",
            model_file_dir="ckpts/roformer/" #model_bs_roformer_ep_317_sdr_12.9755.ckpt"
        )
        sep.load_model()
        out_files = sep.separate(use_path, {"Vocals": dst.stem})
        out = Path(out_files[0])
        # Separator may return a bare filename; resolve it against output_dir.
        return str(out if out.is_absolute() else (dst.parent / out))
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
        torch.cuda.empty_cache()  # no-op when CUDA is absent/uninitialized
        gc.collect()
        torch.set_default_device(default_device)
# Example:
# final = get_vocals("in/clip.mp3", "out/vocals.wav")
# print(final)