Spaces: Running on Zero
| """ | |
| YingMusic Singer - Command Line Inference | |
| ========================================== | |
| Single-sample inference script, replacing the Gradio Web UI. | |
| Usage: | |
| python infer.py \ | |
| --ref_audio path/to/ref.wav \ | |
| --melody_audio path/to/melody.wav \ | |
| --ref_text "该体谅的不执着|如果那天我" \ | |
| --target_text "好多天|看不完你" \ | |
| --output output.wav | |
| # Enable vocal separation + accompaniment mixing simultaneously | |
| python infer.py \ | |
| --ref_audio ref.wav \ | |
| --melody_audio melody.wav \ | |
| --ref_text "..." \ | |
| --target_text "..." \ | |
| --separate_vocals \ | |
| --mix_accompaniment \ | |
| --output mixed_output.wav | |
| """ | |
| import argparse | |
| import os | |
| import random | |
| import tempfile | |
| import torch | |
| import torchaudio | |
| from initialization import download_files | |
# ---------------------------------------------------------------------------
# Model loading (lazy singleton)
# ---------------------------------------------------------------------------
# Lazily-initialized module-level singletons; populated on first use by
# get_model() / get_separator() so importing this module stays cheap.
_model = None       # YingMusicSinger model instance (set by get_model)
_separator = None   # vocal/instrument separator instance (set by get_separator)
def get_device():
    """Return the inference device: the first CUDA GPU when available, else CPU."""
    if torch.cuda.is_available():
        return "cuda:0"
    return "cpu"
def get_model():
    """Return the process-wide YingMusicSinger model, loading it on first call.

    Downloads checkpoints, loads the pretrained weights, moves the model to
    the selected device, and switches it to eval mode. Subsequent calls
    return the cached instance.
    """
    global _model
    if _model is not None:
        return _model
    download_files(task="infer")
    from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger
    model = YingMusicSinger.from_pretrained("ASLP-lab/YingMusic-Singer")
    model = model.to(get_device())
    model.eval()
    _model = model
    return _model
def get_separator():
    """Return the process-wide vocal/instrument separator, loading it on first call.

    Downloads checkpoints on first use and caches the Separator instance for
    all later calls.
    """
    global _separator
    if _separator is not None:
        return _separator
    download_files(task="infer")
    from src.third_party.MusicSourceSeparationTraining.inference_api import Separator
    _separator = Separator(
        config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
        checkpoint_path="ckpts/MelBandRoformer.ckpt",
    )
    return _separator
# ---------------------------------------------------------------------------
# Vocal separation
# ---------------------------------------------------------------------------
def separate_vocals(audio_path: str) -> tuple:
    """Split an audio file into vocal and instrumental stems.

    Returns a ``(vocals_path, accompaniment_path)`` pair of wav files written
    into a freshly created temporary directory.
    """
    separator = get_separator()
    waveform, sample_rate = torchaudio.load(audio_path)
    vocals, instrumental, out_sr = separator.separate(waveform, sample_rate)
    out_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(out_dir, "vocals.wav")
    accomp_path = os.path.join(out_dir, "accompaniment.wav")
    # separator.separate returns numpy arrays — convert before saving.
    torchaudio.save(vocals_path, torch.from_numpy(vocals), out_sr)
    torchaudio.save(accomp_path, torch.from_numpy(instrumental), out_sr)
    return vocals_path, accomp_path
# ---------------------------------------------------------------------------
# Mix vocals + accompaniment
# ---------------------------------------------------------------------------
def mix_vocal_and_accompaniment(vocal_path: str, accomp_path: str, vocal_gain: float = 1.0) -> str:
    """Overlay a vocal track onto an accompaniment track.

    Args:
        vocal_path: wav file containing the vocal stem.
        accomp_path: wav file containing the accompaniment stem.
        vocal_gain: linear gain applied to the vocal before summing.

    Returns:
        Path to the mixed wav file, written into a fresh temporary directory
        at the vocal track's sample rate.
    """
    vocal_wav, vocal_sr = torchaudio.load(vocal_path)
    accomp_wav, accomp_sr = torchaudio.load(accomp_path)
    # Resample the accompaniment so both streams share the vocal sample rate.
    if accomp_sr != vocal_sr:
        accomp_wav = torchaudio.functional.resample(accomp_wav, accomp_sr, vocal_sr)
    # Match channel counts. Tensor.expand only broadcasts a size-1 dimension,
    # so the previous unconditional expand raised a RuntimeError whenever both
    # stems were multi-channel with different channel counts; downmix the
    # accompaniment to mono and broadcast it in that case instead.
    if vocal_wav.shape[0] != accomp_wav.shape[0]:
        if vocal_wav.shape[0] == 1:
            vocal_wav = vocal_wav.expand(accomp_wav.shape[0], -1)
        elif accomp_wav.shape[0] == 1:
            accomp_wav = accomp_wav.expand(vocal_wav.shape[0], -1)
        else:
            accomp_wav = accomp_wav.mean(dim=0, keepdim=True).expand(vocal_wav.shape[0], -1)
    # Sum only the overlapping portion (the two stems may differ in length).
    min_len = min(vocal_wav.shape[1], accomp_wav.shape[1])
    mixed = vocal_wav[:, :min_len] * vocal_gain + accomp_wav[:, :min_len]
    # Peak-normalize only when the sum clips, so quiet mixes keep their level.
    peak = mixed.abs().max()
    if peak > 1.0:
        mixed = mixed / peak
    out_path = os.path.join(tempfile.mkdtemp(), "mixed_output.wav")
    torchaudio.save(out_path, mixed, sample_rate=vocal_sr)
    return out_path
# ---------------------------------------------------------------------------
# Main inference pipeline
# ---------------------------------------------------------------------------
def synthesize(args):
    """Run the full inference pipeline for one sample.

    Steps: optional vocal separation of both inputs, model synthesis,
    optional accompaniment mixing, then copy the result to ``args.output``.
    """
    # A negative seed means "pick one at random"; print it so runs can be reproduced.
    actual_seed = args.seed if args.seed >= 0 else random.randint(0, 2**31 - 1)
    print(f"[INFO] Using seed: {actual_seed}")
    if args.mix_accompaniment and not args.separate_vocals:
        # Mixing needs the accompaniment stem produced by separation; previously
        # the flag was silently ignored in this case — warn the user explicitly.
        print("[WARN] --mix_accompaniment requires --separate_vocals; ignoring it.")
    actual_ref_path = args.ref_audio
    actual_melody_path = args.melody_audio
    melody_accomp_path = None
    # Step 1: Vocal separation (optional)
    if args.separate_vocals:
        print("[INFO] Separating vocals from reference audio...")
        actual_ref_path, _ = separate_vocals(args.ref_audio)
        print("[INFO] Separating vocals from melody audio...")
        actual_melody_path, melody_accomp_path = separate_vocals(args.melody_audio)
    # Step 2: Model inference
    print("[INFO] Loading model...")
    model = get_model()
    print("[INFO] Running synthesis...")
    audio_tensor, sr = model(
        ref_audio_path=actual_ref_path,
        melody_audio_path=actual_melody_path,
        ref_text=args.ref_text.strip(),
        target_text=args.target_text.strip(),
        lrc_align_mode="sentence_level",
        sil_len_to_end=args.sil_len_to_end,
        t_shift=args.t_shift,
        nfe_step=args.nfe_step,
        cfg_strength=args.cfg_strength,
        seed=actual_seed,
    )
    vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
    torchaudio.save(vocal_out_path, audio_tensor.to("cpu"), sample_rate=sr)
    # Step 3: Mix accompaniment (optional)
    if args.separate_vocals and args.mix_accompaniment and melody_accomp_path is not None:
        print("[INFO] Mixing vocals with accompaniment...")
        final_path = mix_vocal_and_accompaniment(vocal_out_path, melody_accomp_path)
    else:
        final_path = vocal_out_path
    # Copy the result from the temp directory to the requested output path.
    out_wav, out_sr = torchaudio.load(final_path)
    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    torchaudio.save(args.output, out_wav, sample_rate=out_sr)
    print(f"[INFO] Saved to: {args.output}")
# ---------------------------------------------------------------------------
# Argument parser
# ---------------------------------------------------------------------------
def parse_args():
    """Build the CLI and parse ``sys.argv`` into a namespace."""
    p = argparse.ArgumentParser(
        description="YingMusic Singer - Single sample command line inference"
    )
    # Required inputs
    p.add_argument("--ref_audio", required=True, help="Reference audio path")
    p.add_argument("--melody_audio", required=True, help="Melody audio path")
    p.add_argument("--ref_text", required=True,
                   help="Reference lyrics, use | to separate phrases")
    p.add_argument("--target_text", required=True,
                   help="Target lyrics, use | to separate phrases")
    # Output
    p.add_argument("--output", default="output.wav",
                   help="Output wav path (default: output.wav)")
    # Optional flags
    p.add_argument("--separate_vocals", action="store_true",
                   help="Separate vocals before synthesis")
    p.add_argument("--mix_accompaniment", action="store_true",
                   help="Mix accompaniment into output (requires --separate_vocals)")
    # Advanced params
    p.add_argument("--nfe_step", type=int, default=32, help="NFE steps (default: 32)")
    p.add_argument("--cfg_strength", type=float, default=3.0,
                   help="CFG strength (default: 3.0)")
    p.add_argument("--t_shift", type=float, default=0.5, help="t-shift (default: 0.5)")
    p.add_argument("--sil_len_to_end", type=float, default=0.5,
                   help="Silence padding in seconds (default: 0.5)")
    p.add_argument("--seed", type=int, default=-1,
                   help="Random seed, -1 for random (default: -1)")
    return p.parse_args()
| if __name__ == "__main__": | |
| args = parse_args() | |
| synthesize(args) |