import os
import tempfile
import warnings

warnings.filterwarnings("ignore")

import gradio as gr
import numpy as np
import soundfile as sf
import librosa
import torch

from huggingface_hub import snapshot_download

MODEL_DIR = os.path.join(os.getcwd(), "models")
OPENVOICE_REPO = "myshell-ai/OpenVoiceV2"

os.makedirs(MODEL_DIR, exist_ok=True)

# Lazily initialised model handles, populated by the _ensure_* helpers below.
_openvoice_loaded = False
_tone_converter = None
_content_extractor = None

_demucs_model = None


def _ensure_openvoice():
    global _openvoice_loaded, _tone_converter, _content_extractor
    if _openvoice_loaded:
        return

    local_dir = snapshot_download(
        repo_id=OPENVOICE_REPO,
        local_dir=os.path.join(MODEL_DIR, "openvoice"),
        local_dir_use_symlinks=False,
    )

    # The snapshot may vendor the openvoice package rather than rely on a
    # pip install, so make the download importable.
    import sys
    if local_dir not in sys.path:
        sys.path.append(local_dir)

    try:
        from openvoice import se_extractor  # noqa: F401 -- availability check
        from openvoice.api import ToneColorConverter, ContentVec
    except Exception:
        # Fallback for snapshots that expose the modules at the top level.
        from tone_color_converter.api import ToneColorConverter
        from contentvec.api import ContentVec

    content_ckpt = os.path.join(local_dir, "checkpoints", "contentvec", "checkpoint.pth")
    _content_extractor = ContentVec(content_ckpt)

    tcc_ckpt = os.path.join(local_dir, "checkpoints", "tone_color_converter", "checkpoint.pth")
    # gradio has no CUDA helper; the availability check belongs to torch.
    device = os.environ.get("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
    _tone_converter = ToneColorConverter(tcc_ckpt, device=device)

    _openvoice_loaded = True
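
# Usage note: set the DEVICE environment variable (e.g. DEVICE=cpu) to
# override the CUDA auto-detection above.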


def _ensure_demucs():
    global _demucs_model
    if _demucs_model is not None:
        return
    from demucs.apply import apply_model
    from demucs.pretrained import get_model
    from demucs.audio import AudioFile
    _demucs_model = {
        "apply_model": apply_model,
        "get_model": get_model,
        "AudioFile": AudioFile,
    }


def separate_vocals(wav_path, stem="vocals"):
    """Return paths to the separated vocals and accompaniment using htdemucs."""
    _ensure_demucs()
    apply_model = _demucs_model["apply_model"]
    get_model = _demucs_model["get_model"]
    AudioFile = _demucs_model["AudioFile"]

    model = get_model(name="htdemucs")
    model.cpu()

    # AudioFile.read() returns a (channels, samples) tensor (it is not a
    # context manager); apply_model expects a batch dimension and returns
    # (batch, stems, channels, samples).
    mix = AudioFile(wav_path).read(streams=0, samplerate=44100, channels=2)
    out = apply_model(model, mix[None], shifts=1, split=True, overlap=0.25)[0]
    sources = {name: out[idx] for idx, name in enumerate(model.sources)}

    base = os.path.splitext(os.path.basename(wav_path))[0]
    out_dir = tempfile.mkdtemp(prefix="stems_")
    vocal_path = os.path.join(out_dir, f"{base}_vocals.wav")
    inst_path = os.path.join(out_dir, f"{base}_inst.wav")

    sf.write(vocal_path, sources["vocals"].cpu().numpy().T, 44100)

    # The accompaniment is the *sum* of the non-vocal stems (they partition
    # the mix); averaging them would attenuate the instrumental.
    inst = sum(v for k, v in sources.items() if k != "vocals")
    sf.write(inst_path, inst.cpu().numpy().T, 44100)
    return vocal_path, inst_path
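

# Typical use (the file name is hypothetical): split a track into stems, feed
# the vocal stem to convert_voice() below, keep the instrumental for mixdown:
#   vocal_path, inst_path = separate_vocals("song.wav")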


def load_audio(x, sr=44100, mono=True):
    y, _sr = librosa.load(x, sr=sr, mono=mono)
    return y, sr


def save_audio(y, sr):
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(path, y, sr)
    return path


def match_length(a, b):
    """Pad or trim `a` so it is exactly as long as `b`."""
    if len(a) < len(b):
        a = np.pad(a, (0, len(b) - len(a)))
    else:
        a = a[:len(b)]
    return a
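

# A minimal mixdown sketch (assumed wiring, not part of the original code):
# overlay converted vocals on the instrumental stem from separate_vocals(),
# aligning lengths with match_length(). The 0.9 vocal gain is arbitrary.
def _example_mixdown(vocal_path, inst_path, vocal_gain=0.9):
    vox, sr = load_audio(vocal_path, sr=44100)
    inst, _ = load_audio(inst_path, sr=44100)
    vox = match_length(vox, inst)
    return save_audio(vocal_gain * vox + inst, sr)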


def convert_voice(reference_wav, source_vocal_wav, style_strength=0.8, pitch_shift=0.0, formant_shift=0.0):
    _ensure_openvoice()

    # OpenVoice's converter works on 16 kHz mono audio.
    src, sr = load_audio(source_vocal_wav, sr=16000, mono=True)

    # Content (what is sung) comes from the source vocal...
    content = _content_extractor.extract(src, sr)

    # ...and the tone-color embedding (how it sounds) from the reference.
    # get_se() takes the converter model and returns (embedding, audio_name).
    try:
        from openvoice import se_extractor
        se, _ = se_extractor.get_se(reference_wav, _tone_converter, vad=True)
    except Exception:
        from se_extractor import get_se
        se = get_se(reference_wav)

    y = _tone_converter.convert(content, se, style_strength=style_strength)

    # Optional post-processing on the converted vocal.
    if abs(pitch_shift) > 1e-3:
        y = librosa.effects.pitch_shift(y.astype(np.float32), sr=16000, n_steps=pitch_shift)
    if abs(formant_shift) > 1e-3:
        # Crude formant tilt via a high-pass blend; a true formant shift would
        # need vocoder-style analysis/resynthesis.
        import scipy.signal as sps
        w = 2 * np.pi * 1500 / 16000
        b, a = sps.iirfilter(2, Wn=w / np.pi, btype='high', ftype='butter')