import argparse
import os
import shutil
import subprocess
import sys
import tempfile
from functools import partial

import gradio as gr
import librosa
import numpy as np
import soundfile

from edgetts.tts_voices import SUPPORTED_LANGUAGES
from inference.infer_tool import Svc

MAXOCTAVE = 2
TEMPDIR = None


def generate_tempfile(suffix=None, prefix=None):
    # create a temporary file inside TEMPDIR so it is purged on the next startup
    global TEMPDIR
    _, filepath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=TEMPDIR)
    return filepath


def find_sovits_model(dirpath):
    for filename in os.listdir(dirpath):
        if filename.endswith(".pth"):
            return os.path.join(dirpath, filename)
    return None


def find_diffusion_model(dirpath):
    for filename in os.listdir(dirpath):
        if filename.startswith("model") and filename.endswith(".pt"):
            return os.path.join(dirpath, filename)
    return None


def find_static_file(dirpath, filename):
    filepath = os.path.join(dirpath, filename)
    return filepath if os.path.exists(filepath) else None


def model_fn(modeldir, model, leakctrl, diffonly, enhancer):
    if model is not None:
        model.unload_model()
    # locate trained models
    sovits_model_path = find_sovits_model(modeldir)
    sovits_config_path = find_static_file(modeldir, "config.json")
    diffusion_model_path = find_diffusion_model(modeldir)
    diffusion_config_path = find_static_file(modeldir, "config.yaml")
    kmeans_model_path = find_static_file(modeldir, "kmeans_10000.pt")
    feature_index_path = find_static_file(modeldir, "feature_and_index.pkl")
    feature_retrieval = leakctrl == "Feature retrieval"
    cluster_model_path = feature_index_path if feature_retrieval else kmeans_model_path
    model = Svc(
        sovits_model_path,
        sovits_config_path,
        cluster_model_path=cluster_model_path,
        feature_retrieval=feature_retrieval,
        diffusion_model_path=diffusion_model_path,
        diffusion_config_path=diffusion_config_path,
        shallow_diffusion=True,
        only_diffusion=diffonly,
        nsf_hifigan_enhance=enhancer,
    )
    speakers = list(model.spk2id.keys())
    return (
        model,
        "Reload Model",
        f"Successfully loaded model into device {str(model.dev)}",
        gr.Dropdown(choices=speakers, value=speakers[0]),
    )


def preset_fn(preset):
    if preset == "Singing":
        f0_predictor = "none"
        leakctrl_ratio = 0.5
    else:
        f0_predictor = "rmvpe"
        leakctrl_ratio = 0
    """
    f0_predictor,
    pitch_shift,
    leakctrl_ratio,
    diff_steps,
    noise_scale,
    silent_padding,
    db_threshold,
    auto_clip,
    clip_overlap,
    cross_fade,
    adaptive_key,
    crepe_f0,
    loudness_ratio,
    reencode_audio,
    """
    return (
        f0_predictor,
        0,
        leakctrl_ratio,
        100,
        0.4,
        0.5,
        -40,
        0,
        0,
        0.75,
        0,
        0.05,
        0,
        False,
    )


def tts_fn(text, gender, lang, rate, volume):
    def to_percent(x):
        return f"+{int(x * 100)}%" if x >= 0 else f"{int(x * 100)}%"

    rate = to_percent(rate)
    volume = to_percent(volume)
    outfile = generate_tempfile(suffix=".wav")
    subprocess.run([sys.executable, "edgetts/tts.py", text, lang, rate, volume, gender, outfile])
    # load the synthesized speech and resample it to 44.1 kHz for the conversion model
    result, orig_sr = librosa.load(outfile)
    os.remove(outfile)
    target_sr = 44100
    resampled = librosa.resample(result, orig_sr=orig_sr, target_sr=target_sr)
    return target_sr, resampled


def inference_fn(
    model,
    speaker,
    input_audio,
    f0_predictor,
    pitch_shift,
    leakctrl_ratio,
    diff_steps,
    noise_scale,
    silent_padding,
    db_threshold,
    auto_clip,
    clip_overlap,
    cross_fade,
    adaptive_key,
    crepe_f0,
    loudness_ratio,
    reencode_audio,
):
    if model is None:
        return "Error: please load model first", None
    if input_audio is None:
        return "Error: please upload an audio", None
    sample_rate, audio = input_audio
    # normalize integer PCM to float32 in [-1, 1] and downmix multi-channel audio to mono
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    infile = generate_tempfile(suffix=".wav")
    soundfile.write(infile, audio, sample_rate, format="wav")
    result = model.slice_inference(
        infile,
        speaker,
        pitch_shift,
        db_threshold,
        leakctrl_ratio,
        f0_predictor != "none",
        noise_scale,
        pad_seconds=silent_padding,
        clip_seconds=auto_clip,
        lg_num=clip_overlap,
        lgr_num=cross_fade,
        f0_predictor="crepe" if f0_predictor == "none" else f0_predictor,
        enhancer_adaptive_key=adaptive_key,
        cr_threshold=crepe_f0,
        k_step=diff_steps,
        use_spk_mix=False,
        second_encoding=reencode_audio,
        loudness_envelope_adjustment=loudness_ratio,
    )
    model.clear_empty()
    os.remove(infile)
    # gr.Audio force-normalizes audio supplied as a numpy array,
    # so we must write to a temporary file and return the filepath here
    prefix = f"{speaker}_{f0_predictor}_pitch{pitch_shift}_timbre{leakctrl_ratio}_diff{diff_steps}_"
    outfile = generate_tempfile(suffix=".wav", prefix=prefix)
    soundfile.write(outfile, result, model.target_sample, format="wav")
    return "Success", outfile


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="so-vits-svc WebUI")
    parser.add_argument("-m", "--model", default="./trained")
    parser.add_argument("-t", "--temp", default="./workspace")
    args = parser.parse_args()

    shutil.rmtree(args.temp, ignore_errors=True)
    os.makedirs(args.temp, exist_ok=True)
    TEMPDIR = args.temp

    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                title = gr.Markdown(value="""# AI Sora Singing Voice Conversion""")
            with gr.Column():
                with gr.Accordion(label="About", open=False):
                    about = gr.Markdown(value="""Space by [KasugaiSakura](https://huggingface.co/KasugaiSakura)
Based on a modified version of [so-vits-svc](https://github.com/meimisaki/so-vits-svc/tree/4.1-Stable)
Voice copyright belongs to [CUFFS/Sphere](https://www.cuffs.co.jp/)""")
        with gr.Row():
            with gr.Column():
                with gr.Accordion(label="Model setup", open=True):
                    leakctrl = gr.Radio(
                        label="Timbre leakage control method",
                        choices=["Feature retrieval", "K-means clustering"],
                        value="Feature retrieval",
                    )
                    diffonly = gr.Checkbox(label="Diffusion only mode")
                    enhancer = gr.Checkbox(label="NSF-HiFiGAN enhancer (not recommended)")
                    modelptr = gr.State(None)
                    modelbtn = gr.Button(value="Load Model", variant="primary")
                    modelmsg = gr.Textbox(label="Model info")
                    speaker = gr.Dropdown(label="Speaker", interactive=True)
                with gr.Accordion(label="Text to speech", open=False):
                    tts_text = gr.Textbox(label="Text", placeholder="Enter text here")
                    tts_gender = gr.Radio(label="Gender", choices=["Male", "Female"], value="Male")
                    tts_lang = gr.Dropdown(label="Language", choices=SUPPORTED_LANGUAGES, value="Auto")
                    tts_rate = gr.Slider(
                        label="Relative speed", minimum=-1, maximum=3, value=0, step=0.1
                    )
                    tts_volume = gr.Slider(
                        label="Relative volume", minimum=-1, maximum=1.5, value=0, step=0.1
                    )
                    tts_btn = gr.Button(value="Synthesize")
                with gr.Accordion(label="Voice conversion", open=True):
                    input_audio = gr.Audio(label="Input audio", type="numpy")
                    inference_btn = gr.Button(value="Inference")
                    output_msg = gr.Textbox(label="Output message")
                    output_audio = gr.Audio(label="Output audio", type="filepath")
            with gr.Column():
                with gr.Accordion(label="Inference options", open=True):
                    inference_preset = gr.Radio(
                        label="Preset",
                        choices=["Singing", "Speaking"],
                        value="Singing",
                        interactive=True,
                    )
                    f0_predictor = gr.Dropdown(
                        label="F0 predictor",
                        choices=["none", "crepe", "dio", "harvest", "pm", "rmvpe"],
                        value="none",
                    )
                    pitch_shift = gr.Slider(
                        label="Pitch shift (in semitones, 12 in an octave)",
                        minimum=-12 * MAXOCTAVE,
                        maximum=12 * MAXOCTAVE,
                        value=0,
                        step=1,
                    )
                    leakctrl_ratio = gr.Slider(
                        label="Timbre leakage control mix ratio (set to 0 to disable it)",
                        minimum=0,
                        maximum=1,
                        value=0.5,
                        step=0.1,
                    )
                    diff_steps = gr.Slider(
                        label="Shallow diffusion steps",
                        minimum=0,
                        maximum=1000,
                        value=100,
                        step=10,
                    )
                    noise_scale = gr.Slider(
                        label="Noise scale (try NOT to modify this parameter)",
                        minimum=0,
                        maximum=1,
                        value=0.4,
                        step=0.01,
                    )
                    silent_padding = gr.Slider(
                        label="Add silent padding to work around noise of unknown cause (in seconds)",
                        minimum=0,
                        maximum=3,
                        value=0.5,
                        step=0.01,
                    )
                    db_threshold = gr.Slider(
                        label="Silence dB threshold (for slicing audio into chunks)",
                        minimum=-100,
                        maximum=0,
                        value=-40,
                        step=1,
                    )
                    auto_clip = gr.Slider(
                        label="Apply auto clip to reduce memory consumption (in seconds)",
                        minimum=0,
                        maximum=100,
                        value=0,
                        step=1,
                    )
                    clip_overlap = gr.Slider(
                        label="Overlap duration between auto clips (in seconds)",
                        minimum=0,
                        maximum=3,
                        value=0,
                        step=0.01,
                    )
                    cross_fade = gr.Slider(
                        label="Cross fade ratio of overlapping regions",
                        minimum=0,
                        maximum=1,
                        value=0.75,
                        step=0.01,
                    )
                    adaptive_key = gr.Slider(
                        label="Enhancer adaptive key (in semitones, 12 in an octave)",
                        minimum=-12 * MAXOCTAVE,
                        maximum=12 * MAXOCTAVE,
                        value=0,
                        step=1,
                    )
                    crepe_f0 = gr.Slider(
                        label="CREPE F0 threshold (increase to reduce noise, but may result in out-of-tune output)",
                        minimum=0,
                        maximum=1,
                        value=0.05,
                        step=0.01,
                    )
                    loudness_ratio = gr.Slider(
                        label="Loudness envelope mix ratio of input and output (0 is input and 1 is output)",
                        minimum=0,
                        maximum=1,
                        value=0,
                        step=0.01,
                    )
                    reencode_audio = gr.Checkbox(
                        label="Re-encode audio before shallow diffusion, with unknown impact on final result"
                    )
        modelbtn.click(
            partial(model_fn, args.model),
            inputs=[modelptr, leakctrl, diffonly, enhancer],
            outputs=[modelptr, modelbtn, modelmsg, speaker],
        )
        inference_preset.change(
            preset_fn,
            inputs=[inference_preset],
            outputs=[
                f0_predictor,
                pitch_shift,
                leakctrl_ratio,
                diff_steps,
                noise_scale,
                silent_padding,
                db_threshold,
                auto_clip,
                clip_overlap,
                cross_fade,
                adaptive_key,
                crepe_f0,
                loudness_ratio,
                reencode_audio,
            ],
        )
        tts_btn.click(
            tts_fn,
            inputs=[tts_text, tts_gender, tts_lang, tts_rate, tts_volume],
            outputs=[input_audio],
        )
        inference_btn.click(
            inference_fn,
            inputs=[
                modelptr,
                speaker,
                input_audio,
                f0_predictor,
                pitch_shift,
                leakctrl_ratio,
                diff_steps,
                noise_scale,
                silent_padding,
                db_threshold,
                auto_clip,
                clip_overlap,
                cross_fade,
                adaptive_key,
                crepe_f0,
                loudness_ratio,
                reencode_audio,
            ],
            outputs=[output_msg, output_audio],
        )

    app.launch(debug=True, share=True)