# NOTE: the three lines that were here ("Spaces:" / "Running on Zero" x2) were
# HuggingFace Spaces page-header residue from scraping ("Zero" = the ZeroGPU
# runtime); they are not part of the application source.
| """ | |
| YingMusic Singer - Gradio Web Interface | |
| ======================================== | |
| 基于参考音色与旋律音频的歌声合成系统,支持自动分离人声与伴奏。 | |
| A singing voice synthesis system powered by YingMusicSinger, | |
| with built-in vocal/accompaniment separation via MelBandRoformer. | |
| """ | |
| try: | |
| import spaces | |
| except ImportError: | |
| spaces = None | |
| import os | |
| import tempfile | |
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| from initialization import download_files | |
| IS_HF_SPACE = os.environ.get("SPACE_ID") is not None | |
| HF_ENABLE = False | |
| LOCAL_DEVICE = "cuda" if torch.cuda.is_available() else "cpu" | |
def local_move2gpu(x):
    """Place *x* on the local inference device.

    On HuggingFace Spaces this is a no-op (ZeroGPU manages device
    placement); locally the object is moved to ``LOCAL_DEVICE``.
    """
    return x if IS_HF_SPACE else x.to(LOCAL_DEVICE)
| # --------------------------------------------------------------------------- | |
| # Model loading (lazy, singleton) / 模型懒加载(单例) | |
| # --------------------------------------------------------------------------- | |
| _model = None | |
| _separator = None | |
def _load_model_impl():
    """Lazily load the YingMusicSinger model (module-level singleton).

    Carries no GPU decorator itself; it must be called from within an
    already-active GPU context (e.g. inside a ``@spaces.GPU`` function).

    Returns:
        The cached, eval-mode YingMusicSinger instance.
    """
    global _model
    if _model is None:
        # Fix: fetch checkpoints only on the first load — the original called
        # download_files() on every invocation, even when the model was cached.
        download_files(task="infer")
        from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger

        _model = YingMusicSinger.from_pretrained("ASLP-lab/YingMusic-Singer")
        _model = local_move2gpu(_model)
        _model.eval()
    return _model
def _load_separator_impl():
    """Lazily load the MelBandRoformer separator (module-level singleton).

    Carries no GPU decorator itself; it must be called from within an
    already-active GPU context.

    Returns:
        The cached Separator instance.
    """
    global _separator
    if _separator is None:
        # Fix: fetch checkpoints only on the first load — the original called
        # download_files() on every invocation, even when the separator was cached.
        download_files(task="infer")
        from src.third_party.MusicSourceSeparationTraining.inference_api import Separator

        _separator = Separator(
            config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
            checkpoint_path="ckpts/MelBandRoformer.ckpt",
        )
    return _separator
| # --------------------------------------------------------------------------- | |
| # Vocal separation utilities / 人声分离工具 | |
| # --------------------------------------------------------------------------- | |
def _separate_vocals_impl(audio_path: str) -> tuple:
    """Split *audio_path* into vocals and accompaniment wav files.

    Uses the MelBandRoformer separator; must be called within an active
    GPU context. Returns ``(vocals_path, accompaniment_path)``.
    """
    sep = _load_separator_impl()
    waveform, sample_rate = torchaudio.load(audio_path)
    # Separator returns numpy arrays plus its output sample rate.
    vocals, accompaniment, out_sr = sep.separate(waveform, sample_rate)

    out_dir = tempfile.mkdtemp()
    paths = (
        os.path.join(out_dir, "vocals.wav"),
        os.path.join(out_dir, "accompaniment.wav"),
    )
    for path, stem in zip(paths, (vocals, accompaniment)):
        torchaudio.save(path, torch.from_numpy(stem), out_sr)
    return paths
def mix_vocal_and_accompaniment(
    vocal_path: str,
    accomp_path: str,
    vocal_gain: float = 1.0,
) -> str:
    """Mix synthesised vocals with accompaniment into a final audio file.

    The accompaniment is resampled to the vocal sample rate, channel counts
    are matched by broadcasting whichever side is mono, both signals are
    trimmed to the shorter duration, and the sum is peak-normalised only if
    it would clip. Returns the path of the mixed wav file.
    """
    vocals, sr = torchaudio.load(vocal_path)
    backing, backing_sr = torchaudio.load(accomp_path)

    # Bring the accompaniment to the vocal sample rate.
    if backing_sr != sr:
        backing = torchaudio.functional.resample(backing, backing_sr, sr)

    # Match channel counts by expanding the mono side.
    if vocals.shape[0] != backing.shape[0]:
        if vocals.shape[0] == 1:
            vocals = vocals.expand(backing.shape[0], -1)
        else:
            backing = backing.expand(vocals.shape[0], -1)

    # Trim both signals to the shorter length, then mix with the vocal gain.
    n_samples = min(vocals.shape[1], backing.shape[1])
    mixed = vocals[:, :n_samples] * vocal_gain + backing[:, :n_samples]

    # Peak-normalise only when the mix clips.
    peak = mixed.abs().max()
    if peak > 1.0:
        mixed = mixed / peak

    out_path = os.path.join(tempfile.mkdtemp(), "mixed_output.wav")
    torchaudio.save(out_path, mixed, sample_rate=sr)
    return out_path
| # --------------------------------------------------------------------------- | |
| # Inference wrapper / 推理入口 | |
| # Single @spaces.GPU scope covers ALL heavy work (separation + synthesis) | |
| # so models stay resident in GPU memory across steps within one call. | |
| # --------------------------------------------------------------------------- | |
| def synthesize( | |
| ref_audio, | |
| melody_audio, | |
| ref_text, | |
| target_text, | |
| separate_vocals_flag, | |
| mix_accompaniment_flag, | |
| sil_len_to_end, | |
| t_shift, | |
| nfe_step, | |
| cfg_strength, | |
| seed, | |
| ): | |
| """ | |
| 主合成流程 / Main synthesis pipeline. | |
| 1. (可选) 用 MelBandRoformer 分离参考音频和旋律音频的人声与伴奏 | |
| 2. 送入 YingMusicSinger 合成 | |
| 3. (可选) 将合成人声与旋律音频的伴奏混合 | |
| """ | |
| import random | |
| # ---- 输入校验 / Input validation ---------------------------------------- | |
| if ref_audio is None: | |
| raise gr.Error("请上传参考音频 / Please upload Reference Audio") | |
| if melody_audio is None: | |
| raise gr.Error("请上传旋律音频 / Please upload Melody Audio") | |
| if not ref_text.strip(): | |
| raise gr.Error("请输入参考音频对应的歌词 / Please enter Reference Text") | |
| if not target_text.strip(): | |
| raise gr.Error("请输入目标合成歌词 / Please enter Target Text") | |
| ref_audio_path = ref_audio if isinstance(ref_audio, str) else ref_audio[0] | |
| melody_audio_path = ( | |
| melody_audio if isinstance(melody_audio, str) else melody_audio[0] | |
| ) | |
| actual_seed = int(seed) | |
| if actual_seed < 0: | |
| actual_seed = random.randint(0, 2**31 - 1) | |
| # ---- Step 1: 人声分离(合并在同一 GPU 上下文中)/ Vocal separation (same GPU context) ---- | |
| melody_accomp_path = None | |
| actual_ref_path = ref_audio_path | |
| actual_melody_path = melody_audio_path | |
| if separate_vocals_flag: | |
| ref_vocals_path, _ = _separate_vocals_impl(ref_audio_path) | |
| actual_ref_path = ref_vocals_path | |
| melody_vocals_path, melody_accomp_path = _separate_vocals_impl(melody_audio_path) | |
| actual_melody_path = melody_vocals_path | |
| # ---- Step 2: 模型推理 / Model inference (same GPU context) --------------- | |
| model = _load_model_impl() | |
| audio_tensor, sr = model( | |
| ref_audio_path=actual_ref_path, | |
| melody_audio_path=actual_melody_path, | |
| ref_text=ref_text.strip(), | |
| target_text=target_text.strip(), | |
| lrc_align_mode="sentence_level", | |
| sil_len_to_end=float(sil_len_to_end), | |
| t_shift=float(t_shift), | |
| nfe_step=int(nfe_step), | |
| cfg_strength=float(cfg_strength), | |
| seed=actual_seed, | |
| ) | |
| vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav") | |
| torchaudio.save(vocal_out_path, audio_tensor.to("cpu"), sample_rate=sr) | |
| # ---- Step 3: 混合伴奏 / Mix accompaniment (optional) --------------------- | |
| if ( | |
| separate_vocals_flag | |
| and mix_accompaniment_flag | |
| and melody_accomp_path is not None | |
| ): | |
| final_path = mix_vocal_and_accompaniment(vocal_out_path, melody_accomp_path) | |
| return final_path | |
| else: | |
| return vocal_out_path | |
| # --------------------------------------------------------------------------- | |
| # Example presets / 预设示例 | |
| # --------------------------------------------------------------------------- | |
| EXAMPLES_MELODY_CONTROL = [ | |
| # [ref_audio, melody_audio, ref_text, target_text, sep, mix, sil, t_shift, nfe, cfg, seed] | |
| [ | |
| "examples/melody_control/ref_01.wav", | |
| "examples/melody_control/melody_01.wav", | |
| "该体谅的不执着|如果那天我", | |
| "好多天|看不完你", | |
| True, False, 0.5, 0.5, 32, 3.0, -1, | |
| ], | |
| [ | |
| "examples/melody_control/ref_02.wav", | |
| "examples/melody_control/melody_02.wav", | |
| "月光下的身影|渐渐模糊", | |
| "星光照亮前路|指引方向", | |
| True, False, 0.5, 0.5, 32, 3.0, -1, | |
| ], | |
| ] | |
| EXAMPLES_LYRIC_EDIT = [ | |
| [ | |
| "examples/lyric_edit/ref_01.wav", | |
| "examples/lyric_edit/melody_01.wav", | |
| "该体谅的不执着|如果那天我", | |
| "忘不掉的笑容|留在心里面", | |
| True, False, 0.5, 0.5, 32, 3.0, -1, | |
| ], | |
| [ | |
| "examples/lyric_edit/ref_02.wav", | |
| "examples/lyric_edit/melody_02.wav", | |
| "夜深了还不睡|想着你的脸", | |
| "春风又吹过来|带走我思念", | |
| True, False, 0.5, 0.5, 32, 3.0, -1, | |
| ], | |
| ] | |
| # --------------------------------------------------------------------------- | |
| # Custom CSS / 自定义样式 | |
| # --------------------------------------------------------------------------- | |
| CUSTOM_CSS = """ | |
| @import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,500;0,9..40,700;1,9..40,400&family=Playfair+Display:wght@600;800&display=swap'); | |
| :root { | |
| --primary: #e85d04; | |
| --primary-light: #f48c06; | |
| --bg-dark: #0d1117; | |
| --surface: #161b22; | |
| --surface-light: #21262d; | |
| --text: #f0f6fc; | |
| --text-muted: #8b949e; | |
| --accent-glow: rgba(232, 93, 4, 0.15); | |
| --border: #30363d; | |
| } | |
| .gradio-container { | |
| font-family: 'DM Sans', sans-serif !important; | |
| max-width: 1100px !important; | |
| margin: auto !important; | |
| } | |
| /* ---------- Badge links: no underline, no gap artifacts ---------- */ | |
| #app-header .badges a { | |
| text-decoration: none !important; | |
| display: inline-block; | |
| line-height: 0; | |
| margin: 3px 2px; | |
| } | |
| #app-header .badges a img, | |
| #app-header .badges > img { | |
| display: inline-block; | |
| vertical-align: middle; | |
| margin: 0; | |
| } | |
| #app-header .badges { | |
| line-height: 1.8; | |
| } | |
| /* ---------- Header / 头部 ---------- */ | |
| #app-header { | |
| text-align: center; | |
| padding: 1.8rem 1rem 0.5rem; | |
| } | |
| #app-header h1 { | |
| font-size: 1.45rem !important; | |
| font-weight: 700 !important; | |
| line-height: 1.4; | |
| margin-bottom: 0.6rem !important; | |
| } | |
| #app-header .badges img { | |
| display: inline-block; | |
| margin: 3px 2px; | |
| vertical-align: middle; | |
| } | |
| #app-header .authors { | |
| color: var(--text-muted); | |
| font-size: 0.92rem; | |
| margin: 0.5rem 0 0.2rem; | |
| line-height: 1.7; | |
| } | |
| #app-header .affiliations { | |
| color: var(--text-muted); | |
| font-size: 0.85rem; | |
| margin-bottom: 0.5rem; | |
| } | |
| #app-header .lang-links a { | |
| color: var(--primary-light); | |
| text-decoration: none; | |
| margin: 0 4px; | |
| font-size: 0.9rem; | |
| } | |
| #app-header .lang-links a:hover { text-decoration: underline; } | |
| /* ---------- Disclaimer ---------- */ | |
| #disclaimer { | |
| border-top: 1px solid var(--border); | |
| margin: 24px 0 4px; | |
| padding: 14px 4px 4px; | |
| font-size: 0.80rem; | |
| color: #6e7681; | |
| line-height: 1.65; | |
| } | |
| #disclaimer strong { | |
| color: #8b949e; | |
| font-weight: 600; | |
| } | |
| /* ---------- Section labels / 分区标题 ---------- */ | |
| .section-title { | |
| font-family: 'DM Sans', sans-serif !important; | |
| font-weight: 700 !important; | |
| font-size: 1rem !important; | |
| letter-spacing: 0.06em; | |
| text-transform: uppercase; | |
| color: var(--primary-light) !important; | |
| border-bottom: 2px solid var(--primary); | |
| padding-bottom: 6px; | |
| margin-bottom: 12px !important; | |
| } | |
| /* ---------- Example tabs ---------- */ | |
| .example-tab-label { | |
| font-weight: 600 !important; | |
| font-size: 0.95rem !important; | |
| } | |
| /* ---------- Run button / 合成按钮 ---------- */ | |
| #run-btn { | |
| background: linear-gradient(135deg, #e85d04, #dc2f02) !important; | |
| border: none !important; | |
| color: #fff !important; | |
| font-weight: 700 !important; | |
| font-size: 1.1rem !important; | |
| letter-spacing: 0.04em; | |
| padding: 12px 0 !important; | |
| border-radius: 10px !important; | |
| transition: transform 0.15s, box-shadow 0.25s !important; | |
| box-shadow: 0 4px 20px rgba(232, 93, 4, 0.35) !important; | |
| } | |
| #run-btn:hover { | |
| transform: translateY(-1px) !important; | |
| box-shadow: 0 6px 28px rgba(232, 93, 4, 0.5) !important; | |
| } | |
| /* ---------- Output audio / 输出音频 ---------- */ | |
| #output-audio { | |
| border: 2px solid var(--primary) !important; | |
| border-radius: 12px !important; | |
| background: var(--accent-glow) !important; | |
| } | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # Header HTML / 头部 HTML | |
| # --------------------------------------------------------------------------- | |
| HEADER_HTML = """ | |
| <div id="app-header" align="center"> | |
| <h1> | |
| 🎤 YingMusic-Singer: Controllable Singing Voice Synthesis with Flexible Lyric Manipulation and Annotation-free Melody Guidance | |
| </h1> | |
| <div class="badges" style="margin: 10px 0;"> | |
| <img src="https://img.shields.io/badge/Python-3.10-3776AB?logo=python&logoColor=white" alt="Python"> | |
| <img src="https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey" alt="License"> | |
| <a href="https://arxiv.org/abs/0.0" target="_blank"> | |
| <img src="https://img.shields.io/badge/arXiv-0.0-b31b1b?logo=arxiv&logoColor=white" alt="arXiv"> | |
| </a> | |
| <a href="https://github.com/ASLP-lab/YingMusic-Singer" target="_blank"> | |
| <img src="https://img.shields.io/badge/GitHub-YingMusic--Singer-181717?logo=github&logoColor=white" alt="GitHub"> | |
| </a> | |
| <a href="https://huggingface.co/spaces/ASLP-lab/YingMusic-Singer" target="_blank"> | |
| <img src="https://img.shields.io/badge/🤗%20HuggingFace-Space-FFD21E" alt="HuggingFace Space"> | |
| </a> | |
| <a href="https://huggingface.co/ASLP-lab/YingMusic-Singer" target="_blank"> | |
| <img src="https://img.shields.io/badge/🤗%20HuggingFace-Model-FF9D00" alt="HuggingFace Model"> | |
| </a> | |
| <a href="https://huggingface.co/datasets/ASLP-lab/LyricEditBench" target="_blank"> | |
| <img src="https://img.shields.io/badge/🤗%20HuggingFace-LyricEditBench-FF6F00" alt="LyricEditBench"> | |
| </a> | |
| <a href="https://discord.gg/RXghgWyvrn" target="_blank"> | |
| <img src="https://img.shields.io/badge/Discord-Join%20Us-5865F2?logo=discord&logoColor=white" alt="Discord"> | |
| </a> | |
| <a href="https://github.com/ASLP-lab/YingMusic-Singer/blob/main/assets/wechat_qr.png" target="_blank"> | |
| <img src="https://img.shields.io/badge/WeChat-Group-07C160?logo=wechat&logoColor=white" alt="WeChat"> | |
| </a> | |
| <a href="http://www.npu-aslp.org/" target="_blank"> | |
| <img src="https://img.shields.io/badge/🏫%20ASLP-Lab-4A90D9" alt="ASLP Lab"> | |
| </a> | |
| </div> | |
| <p class="authors"> | |
| <a href="https://orcid.org/0009-0005-5957-8936" target="_blank"><b>Chunbo Hao</b></a>¹² · | |
| <a href="https://orcid.org/0009-0003-2602-2910" target="_blank"><b>Junjie Zheng</b></a>² · | |
| <a href="https://orcid.org/0009-0001-6706-0572" target="_blank"><b>Guobin Ma</b></a>¹ · | |
| <b>Yuepeng Jiang</b>¹ · | |
| <b>Huakang Chen</b>¹ · | |
| <b>Wenjie Tian</b>¹ · | |
| <a href="https://orcid.org/0009-0003-9258-4006" target="_blank"><b>Gongyu Chen</b></a>² · | |
| <a href="https://orcid.org/0009-0005-5413-6725" target="_blank"><b>Zihao Chen</b></a>² · | |
| <b>Lei Xie</b>¹ | |
| </p> | |
| <p class="affiliations"> | |
| <sup>1</sup> Northwestern Polytechnical University · <sup>2</sup> Giant Network | |
| </p> | |
| </div> | |
| """ | |
| DISCLAIMER_HTML = """ | |
| <div id="disclaimer"> | |
| <strong>Disclaimer / 免责声明</strong><br> | |
| YingMusic-Singer enables the creation of singing voices with modified lyrics, supporting applications | |
| in artistic creation and entertainment. Potential risks include unauthorized voice cloning and copyright | |
| infringement. To ensure responsible deployment, users should obtain consent for voice usage, disclose | |
| AI involvement, and verify musical originality.<br> | |
| <span style="opacity:0.75;">YingMusic-Singer 可用于修改歌词后的歌声合成,支持艺术创作与娱乐应用。潜在风险包括未经授权的声音克隆与版权侵权。为确保负责任地使用,用户应在使用他人声音前获得授权、公开 AI 参与情况,并确认音乐内容的原创性。</span> | |
| </div> | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # Build the Gradio UI / 构建界面 | |
| # --------------------------------------------------------------------------- | |
| def build_ui(): | |
| # Shared input components referenced by both Examples tabs and main form | |
| ALL_INPUTS_ORDER = None # defined after components are created | |
| with gr.Blocks( | |
| css=CUSTOM_CSS, title="YingMusic Singer", theme=gr.themes.Base() | |
| ) as demo: | |
| # ---- Header / 头部 ---- | |
| gr.HTML(HEADER_HTML) | |
| gr.HTML("<hr style='border-color:#30363d; margin: 8px 0 18px;'>") | |
| # ================================================================ | |
| # ROW 1 – 音频输入 / Audio Inputs + 歌词 / Lyrics (side by side) | |
| # ================================================================ | |
| with gr.Row(equal_height=True): | |
| # ---- 左栏:音频上传 / Left column: audio uploads ---- | |
| with gr.Column(scale=1): | |
| gr.Markdown( | |
| "#### 🎙️ 音频输入 / Audio Inputs", elem_classes="section-title" | |
| ) | |
| ref_audio = gr.Audio( | |
| label="参考音频 / Reference Audio(提供音色 / provides timbre)", | |
| type="filepath", | |
| ) | |
| melody_audio = gr.Audio( | |
| label="旋律音频 / Melody Audio(提供旋律与时长 / provides melody & duration)", | |
| type="filepath", | |
| ) | |
| # ---- 右栏:歌词 / Right column: lyrics ---- | |
| with gr.Column(scale=1): | |
| gr.Markdown("#### ✏️ 歌词 / Lyrics", elem_classes="section-title") | |
| ref_text = gr.Textbox( | |
| label="参考音频歌词 / Reference Text", | |
| placeholder="例如 / e.g.:该体谅的不执着|如果那天我", | |
| lines=5, | |
| ) | |
| target_text = gr.Textbox( | |
| label="目标合成歌词 / Target Text", | |
| placeholder="例如 / e.g.:好多天|看不完你", | |
| lines=5, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Build the Gradio UI / 构建界面 | |
| # --------------------------------------------------------------------------- | |
| def build_ui(): | |
| with gr.Blocks( | |
| css=CUSTOM_CSS, title="YingMusic Singer", theme=gr.themes.Base() | |
| ) as demo: | |
| # ---- Header ---- | |
| gr.HTML(HEADER_HTML) | |
| gr.HTML("<hr style='border-color:#30363d; margin: 8px 0 18px;'>") | |
| # ================================================================ | |
| # ROW 1 – 音频输入 + 歌词 | |
| # ================================================================ | |
| with gr.Row(equal_height=True): | |
| with gr.Column(scale=1): | |
| gr.Markdown("#### 🎙️ 音频输入 / Audio Inputs", elem_classes="section-title") | |
| ref_audio = gr.Audio( | |
| label="参考音频 / Reference Audio(提供音色 / provides timbre)", | |
| type="filepath", | |
| ) | |
| melody_audio = gr.Audio( | |
| label="旋律音频 / Melody Audio(提供旋律与时长 / provides melody & duration)", | |
| type="filepath", | |
| ) | |
| with gr.Column(scale=1): | |
| gr.Markdown("#### ✏️ 歌词 / Lyrics", elem_classes="section-title") | |
| ref_text = gr.Textbox( | |
| label="参考音频歌词 / Reference Text", | |
| placeholder="例如 / e.g.:该体谅的不执着|如果那天我", | |
| lines=5, | |
| ) | |
| target_text = gr.Textbox( | |
| label="目标合成歌词 / Target Text", | |
| placeholder="例如 / e.g.:好多天|看不完你", | |
| lines=5, | |
| ) | |
| # ================================================================ | |
| # ROW 2 – 预设示例 / Example Presets ← before vocal separation | |
| # ================================================================ | |
| gr.HTML("<hr style='border-color:#30363d; margin: 16px 0 12px;'>") | |
| gr.Markdown("#### 🎵 预设示例 / Example Presets", elem_classes="section-title") | |
| gr.Markdown( | |
| "<small style='color:#8b949e;'>点击任意行自动填入输入区域 / Click any row to auto-fill the inputs above</small>" | |
| ) | |
| # Hidden advanced-param components so gr.Examples can reference them | |
| # (real sliders rendered inside the accordion below override these values) | |
| with gr.Row(visible=False): | |
| _sep_flag_ex = gr.Checkbox(value=True) | |
| _mix_flag_ex = gr.Checkbox(value=False) | |
| _sil_ex = gr.Number(value=0.5) | |
| _tshift_ex = gr.Number(value=0.5) | |
| _nfe_ex = gr.Number(value=32) | |
| _cfg_ex = gr.Number(value=3.0) | |
| _seed_ex = gr.Number(value=-1, precision=0) | |
| _example_inputs = [ | |
| ref_audio, melody_audio, ref_text, target_text, | |
| _sep_flag_ex, _mix_flag_ex, | |
| _sil_ex, _tshift_ex, _nfe_ex, _cfg_ex, _seed_ex, | |
| ] | |
| with gr.Tabs(): | |
| with gr.Tab("🎼 Melody Control"): | |
| gr.Examples( | |
| examples=EXAMPLES_MELODY_CONTROL, | |
| inputs=_example_inputs, | |
| label="Melody Control Examples", | |
| examples_per_page=5, | |
| ) | |
| with gr.Tab("✏️ Lyric Edit"): | |
| gr.Examples( | |
| examples=EXAMPLES_LYRIC_EDIT, | |
| inputs=_example_inputs, | |
| label="Lyric Edit Examples", | |
| examples_per_page=5, | |
| ) | |
| # ================================================================ | |
| # ROW 3 – 伴奏分离 / Vocal Separation | |
| # ================================================================ | |
| gr.HTML("<hr style='border-color:#30363d; margin: 16px 0 12px;'>") | |
| gr.Markdown("#### 🎚️ 伴奏分离 / Vocal Separation", elem_classes="section-title") | |
| with gr.Row(): | |
| separate_vocals_flag = gr.Checkbox( | |
| value=True, | |
| label="分离人声后过模型 / Separate vocals before synthesis", | |
| info=( | |
| "从参考音频和旋律音频中分离人声,仅用人声送入模型 / " | |
| "Extract vocals from both reference and melody audio before feeding into the model" | |
| ), | |
| ) | |
| mix_accompaniment_flag = gr.Checkbox( | |
| value=False, | |
| interactive=True, | |
| label="输出时混入伴奏 / Mix accompaniment into output", | |
| info=( | |
| "将合成人声与分离出的伴奏混合输出(需先开启人声分离)/ " | |
| "Mix synthesised vocals with separated accompaniment (requires vocal separation)" | |
| ), | |
| ) | |
| # ================================================================ | |
| # ROW 4 – 高级参数 / Advanced Parameters (collapsible) | |
| # ================================================================ | |
| with gr.Accordion("⚙️ 高级参数 / Advanced Parameters", open=False): | |
| with gr.Row(): | |
| nfe_step = gr.Slider( | |
| minimum=4, maximum=128, value=32, step=1, | |
| label="采样步数 / NFE Steps", | |
| info="更多步数 = 更高质量,但更慢 / More steps = higher quality, but slower", | |
| ) | |
| cfg_strength = gr.Slider( | |
| minimum=0.0, maximum=10.0, value=3.0, step=0.1, | |
| label="CFG 强度 / CFG Strength", | |
| ) | |
| t_shift = gr.Slider( | |
| minimum=0.0, maximum=1.0, value=0.5, step=0.01, | |
| label="采样时间偏移 / t‑shift", | |
| ) | |
| with gr.Row(): | |
| sil_len_to_end = gr.Slider( | |
| minimum=0.0, maximum=3.0, value=0.5, step=0.1, | |
| label="末尾静音时长 (秒) / Silence Padding (s)", | |
| ) | |
| seed = gr.Number( | |
| value=-1, precision=0, | |
| label="随机种子 / Random Seed", | |
| info="-1 表示随机 / -1 means random", | |
| ) | |
| # ================================================================ | |
| # ROW 5 – 合成按钮与输出 / Run & Output | |
| # ================================================================ | |
| gr.HTML("<hr style='border-color:#30363d; margin: 12px 0;'>") | |
| run_btn = gr.Button("🎤 开始合成 / Synthesize", elem_id="run-btn", size="lg") | |
| output_audio = gr.Audio( | |
| label="合成结果 / Generated Audio", | |
| type="filepath", | |
| elem_id="output-audio", | |
| ) | |
| # All inputs for the synthesize() call (uses real sliders, not example placeholders) | |
| _all_inputs = [ | |
| ref_audio, melody_audio, ref_text, target_text, | |
| separate_vocals_flag, mix_accompaniment_flag, | |
| sil_len_to_end, t_shift, nfe_step, cfg_strength, seed, | |
| ] | |
| # ================================================================ | |
| # Event wiring / 事件绑定 | |
| # ================================================================ | |
| separate_vocals_flag.change( | |
| fn=lambda sep: gr.update(interactive=sep, value=False), | |
| inputs=[separate_vocals_flag], | |
| outputs=[mix_accompaniment_flag], | |
| ) | |
| run_btn.click( | |
| fn=synthesize, | |
| inputs=_all_inputs, | |
| outputs=output_audio, | |
| ) | |
| # ---- 页脚:免责声明 / Footer: disclaimer ---- | |
| gr.HTML(DISCLAIMER_HTML) | |
| return demo | |
| # --------------------------------------------------------------------------- | |
| # Entry point / 启动入口 | |
| # --------------------------------------------------------------------------- | |
| if __name__ == "__main__": | |
| demo = build_ui() | |
| demo.queue() | |
| demo.launch(server_name="0.0.0.0", server_port=7860, share=False) |