# Source: YingMusic-Singer-Plus / infer_api.py (revision ffbb4ab)
"""
YingMusic Singer - Command Line Inference
==========================================
Single-sample inference script, replacing the Gradio Web UI.
Usage:
python infer_api.py \
--ref_audio path/to/ref.wav \
--melody_audio path/to/melody.wav \
--ref_text "该体谅的不执着|如果那天我" \
--target_text "好多天|看不完你" \
--output output.wav
# Enable vocal separation + accompaniment mixing simultaneously
python infer_api.py \
--ref_audio ref.wav \
--melody_audio melody.wav \
--ref_text "..." \
--target_text "..." \
--separate_vocals \
--mix_accompaniment \
--output mixed_output.wav
"""
import argparse
import os
import random
import tempfile
import torch
import torchaudio
from initialization import download_files
# ---------------------------------------------------------------------------
# Model loading (lazy singleton)
# ---------------------------------------------------------------------------
_model = None
_separator = None
def get_device():
    """Return the device string to run on: first CUDA GPU if present, else CPU."""
    if torch.cuda.is_available():
        return "cuda:0"
    return "cpu"
def get_model():
    """Return the shared YingMusicSinger model, loading it on first call.

    First call downloads the checkpoints, moves the model to the device
    chosen by get_device() and switches it to eval mode; later calls
    return the same cached instance.
    """
    global _model
    if _model is not None:
        return _model
    download_files(task="infer")
    # Imported lazily so the heavy model code loads only when needed.
    from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger
    model = YingMusicSinger.from_pretrained("ASLP-lab/YingMusic-Singer")
    model = model.to(get_device())
    model.eval()
    _model = model
    return _model
def get_separator():
    """Return the shared vocal separator, building it on first call.

    Uses the MelBand-RoFormer checkpoint downloaded by download_files();
    later calls return the same cached instance.
    """
    global _separator
    if _separator is not None:
        return _separator
    download_files(task="infer")
    # Imported lazily so the separation stack loads only when needed.
    from src.third_party.MusicSourceSeparationTraining.inference_api import Separator
    _separator = Separator(
        config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
        checkpoint_path="ckpts/MelBandRoformer.ckpt",
    )
    return _separator
# ---------------------------------------------------------------------------
# Vocal separation
# ---------------------------------------------------------------------------
def separate_vocals(audio_path: str) -> tuple:
    """Split the audio at *audio_path* into vocals and accompaniment.

    Both stems are written as wav files into a fresh temporary directory.

    Returns:
        (vocals_path, accompaniment_path) tuple of file paths.
    """
    sep = get_separator()
    mixture, in_sr = torchaudio.load(audio_path)
    # Separator returns numpy arrays plus the output sample rate.
    vocal_arr, inst_arr, out_sr = sep.separate(mixture, in_sr)
    out_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(out_dir, "vocals.wav")
    accomp_path = os.path.join(out_dir, "accompaniment.wav")
    for path, arr in ((vocals_path, vocal_arr), (accomp_path, inst_arr)):
        torchaudio.save(path, torch.from_numpy(arr), out_sr)
    return vocals_path, accomp_path
# ---------------------------------------------------------------------------
# Mix vocals + accompaniment
# ---------------------------------------------------------------------------
def mix_vocal_and_accompaniment(vocal_path: str, accomp_path: str, vocal_gain: float = 1.0) -> str:
    """Mix a vocal track with an accompaniment track into one wav file.

    Args:
        vocal_path: Path to the vocal wav; its sample rate drives the mix.
        accomp_path: Path to the accompaniment wav (resampled if it differs).
        vocal_gain: Linear gain applied to the vocals before summing.

    Returns:
        Path of the mixed wav, written to a fresh temporary directory.
    """
    vocal_wav, vocal_sr = torchaudio.load(vocal_path)
    accomp_wav, accomp_sr = torchaudio.load(accomp_path)
    # Match sample rates: the vocal track's rate wins.
    if accomp_sr != vocal_sr:
        accomp_wav = torchaudio.functional.resample(accomp_wav, accomp_sr, vocal_sr)
    # Match channel counts. Tensor.expand() only broadcasts size-1 dims, so
    # the old unconditional expand raised RuntimeError whenever both tracks
    # were multi-channel with different counts; downmix to mono in that case.
    if vocal_wav.shape[0] != accomp_wav.shape[0]:
        if vocal_wav.shape[0] == 1:
            vocal_wav = vocal_wav.expand(accomp_wav.shape[0], -1)
        elif accomp_wav.shape[0] == 1:
            accomp_wav = accomp_wav.expand(vocal_wav.shape[0], -1)
        else:
            vocal_wav = vocal_wav.mean(dim=0, keepdim=True)
            accomp_wav = accomp_wav.mean(dim=0, keepdim=True)
    # Trim both tracks to the shorter length and sum.
    min_len = min(vocal_wav.shape[1], accomp_wav.shape[1])
    mixed = vocal_wav[:, :min_len] * vocal_gain + accomp_wav[:, :min_len]
    # Peak-normalize only when the sum would clip.
    peak = mixed.abs().max()
    if peak > 1.0:
        mixed = mixed / peak
    out_path = os.path.join(tempfile.mkdtemp(), "mixed_output.wav")
    torchaudio.save(out_path, mixed, sample_rate=vocal_sr)
    return out_path
# ---------------------------------------------------------------------------
# Main inference pipeline
# ---------------------------------------------------------------------------
def synthesize(args):
    """Run the full single-sample inference pipeline.

    Steps:
      1. Optionally separate vocals from both inputs (--separate_vocals).
      2. Synthesize target vocals with the YingMusicSinger model.
      3. Optionally remix the melody track's accompaniment (--mix_accompaniment).
      4. Save the final wav to args.output.

    Args:
        args: Parsed argparse.Namespace from parse_args().
    """
    # seed < 0 means "pick one at random"; always print it for reproducibility.
    actual_seed = args.seed if args.seed >= 0 else random.randint(0, 2**31 - 1)
    print(f"[INFO] Using seed: {actual_seed}")
    # Previously this flag combination was silently ignored; warn explicitly.
    if args.mix_accompaniment and not args.separate_vocals:
        print("[WARN] --mix_accompaniment requires --separate_vocals; no accompaniment will be mixed.")
    actual_ref_path = args.ref_audio
    actual_melody_path = args.melody_audio
    melody_accomp_path = None
    # Step 1: Vocal separation (optional). The melody's accompaniment stem is
    # kept for the optional remix in step 3.
    if args.separate_vocals:
        print("[INFO] Separating vocals from reference audio...")
        actual_ref_path, _ = separate_vocals(args.ref_audio)
        print("[INFO] Separating vocals from melody audio...")
        actual_melody_path, melody_accomp_path = separate_vocals(args.melody_audio)
    # Step 2: Model inference.
    print("[INFO] Loading model...")
    model = get_model()
    print("[INFO] Running synthesis...")
    audio_tensor, sr = model(
        ref_audio_path=actual_ref_path,
        melody_audio_path=actual_melody_path,
        ref_text=args.ref_text.strip(),
        target_text=args.target_text.strip(),
        lrc_align_mode="sentence_level",
        sil_len_to_end=args.sil_len_to_end,
        t_shift=args.t_shift,
        nfe_step=args.nfe_step,
        cfg_strength=args.cfg_strength,
        seed=actual_seed,
    )
    vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
    torchaudio.save(vocal_out_path, audio_tensor.to("cpu"), sample_rate=sr)
    # Step 3: Mix accompaniment (optional).
    if args.separate_vocals and args.mix_accompaniment and melody_accomp_path is not None:
        print("[INFO] Mixing vocals with accompaniment...")
        final_path = mix_vocal_and_accompaniment(vocal_out_path, melody_accomp_path)
    else:
        final_path = vocal_out_path
    # Step 4: Re-save at the user-requested output path (the intermediates
    # above live in scattered temp directories).
    out_wav, out_sr = torchaudio.load(final_path)
    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    torchaudio.save(args.output, out_wav, sample_rate=out_sr)
    print(f"[INFO] Saved to: {args.output}")
# ---------------------------------------------------------------------------
# Argument parser
# ---------------------------------------------------------------------------
def parse_args():
    """Build and parse the command-line interface for single-sample inference."""
    parser = argparse.ArgumentParser(
        description="YingMusic Singer - Single sample command line inference"
    )
    # Required inputs
    for flag, desc in (
        ("--ref_audio", "Reference audio path"),
        ("--melody_audio", "Melody audio path"),
        ("--ref_text", "Reference lyrics, use | to separate phrases"),
        ("--target_text", "Target lyrics, use | to separate phrases"),
    ):
        parser.add_argument(flag, required=True, help=desc)
    # Output location
    parser.add_argument("--output", default="output.wav",
                        help="Output wav path (default: output.wav)")
    # Boolean switches
    parser.add_argument("--separate_vocals", action="store_true",
                        help="Separate vocals before synthesis")
    parser.add_argument("--mix_accompaniment", action="store_true",
                        help="Mix accompaniment into output (requires --separate_vocals)")
    # Sampling / decoding knobs
    parser.add_argument("--nfe_step", type=int, default=32,
                        help="NFE steps (default: 32)")
    parser.add_argument("--cfg_strength", type=float, default=3.0,
                        help="CFG strength (default: 3.0)")
    parser.add_argument("--t_shift", type=float, default=0.5,
                        help="t-shift (default: 0.5)")
    parser.add_argument("--sil_len_to_end", type=float, default=0.5,
                        help="Silence padding in seconds (default: 0.5)")
    parser.add_argument("--seed", type=int, default=-1,
                        help="Random seed, -1 for random (default: -1)")
    return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
synthesize(args)