import argparse
import os
import shutil
import subprocess
import sys
import tempfile
from functools import partial
import gradio as gr
import librosa
import numpy as np
import soundfile
from edgetts.tts_voices import SUPPORTED_LANGUAGES
from inference.infer_tool import Svc
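# MAXOCTAVE bounds the pitch-shift sliders below; TEMPDIR is set from --temp at startup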
MAXOCTAVE = 2
TEMPDIR = None
def generate_tempfile(suffix=None, prefix=None):
    # mkstemp returns an open file descriptor; close it so it is not leaked
    fd, filepath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=TEMPDIR)
    os.close(fd)
    return filepath
def find_sovits_model(dirpath):
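    """Return the first *.pth checkpoint found in dirpath, or None if absent."""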
for filename in os.listdir(dirpath):
if filename.endswith(".pth"):
return os.path.join(dirpath, filename)
return None
def find_diffusion_model(dirpath):
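    """Return the first model*.pt diffusion checkpoint in dirpath, or None if absent."""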
for filename in os.listdir(dirpath):
if filename.startswith("model") and filename.endswith(".pt"):
return os.path.join(dirpath, filename)
return None
def find_static_file(dirpath, filename):
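    """Return the path of filename under dirpath if it exists, else None."""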
filepath = os.path.join(dirpath, filename)
return filepath if os.path.exists(filepath) else None
def model_fn(modeldir, model, leakctrl, diffonly, enhancer):
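    """(Re)load every model component from modeldir and refresh the speaker list."""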
if model is not None:
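        # release the previously loaded model before loading a new one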
model.unload_model()
# locate trained models
sovits_model_path = find_sovits_model(modeldir)
sovits_config_path = find_static_file(modeldir, "config.json")
diffusion_model_path = find_diffusion_model(modeldir)
diffusion_config_path = find_static_file(modeldir, "config.yaml")
kmeans_model_path = find_static_file(modeldir, "kmeans_10000.pt")
feature_index_path = find_static_file(modeldir, "feature_and_index.pkl")
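    # both leakage-control methods share the cluster_model_path argument;
    # the feature_retrieval flag tells Svc which kind of file it receives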
feature_retrieval = leakctrl == "Feature retrieval"
cluster_model_path = feature_index_path if feature_retrieval else kmeans_model_path
model = Svc(
sovits_model_path,
sovits_config_path,
cluster_model_path=cluster_model_path,
feature_retrieval=feature_retrieval,
diffusion_model_path=diffusion_model_path,
diffusion_config_path=diffusion_config_path,
shallow_diffusion=True,
only_diffusion=diffonly,
nsf_hifigan_enhance=enhancer,
)
speakers = list(model.spk2id.keys())
return (
model,
"Reload Model",
f"Successfully loaded model into device {str(model.dev)}",
gr.Dropdown(choices=speakers, value=speakers[0]),
)
def preset_fn(preset):
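    """Reset every inference option to defaults suited to singing or speaking input."""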
if preset == "Singing":
f0_predictor = "none"
leakctrl_ratio = 0.5
else:
f0_predictor = "rmvpe"
leakctrl_ratio = 0
"""
f0_predictor, pitch_shift, leakctrl_ratio, diff_steps, noise_scale,
silent_padding, db_threshold, auto_clip, clip_overlap, cross_fade,
adaptive_key, crepe_f0, loudness_ratio, reencode_audio,
"""
return (
f0_predictor, 0, leakctrl_ratio, 100, 0.4,
0.5, -40, 0, 0, 0.75,
0, 0.05, 0, False,
)
def tts_fn(text, gender, lang, rate, volume):
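    """Synthesize speech with edge-tts, then resample to 44.1 kHz for conversion."""
    # edge-tts takes rate/volume as signed percent strings, e.g. to_percent(0.1) == "+10%"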
    def to_percent(x):
        # round rather than truncate: int(2.9 * 100) is 289 due to float error
        pct = round(x * 100)
        return f"+{pct}%" if pct >= 0 else f"{pct}%"
rate = to_percent(rate)
volume = to_percent(volume)
outfile = generate_tempfile(suffix=".wav")
    subprocess.run(
        [sys.executable, "edgetts/tts.py", text, lang, rate, volume, gender, outfile],
        check=True,  # fail loudly instead of reading a missing or empty file
    )
    # load at the native sample rate; librosa would otherwise resample to 22050 Hz
    result, orig_sr = librosa.load(outfile, sr=None)
os.remove(outfile)
target_sr = 44100
resampled = librosa.resample(result, orig_sr=orig_sr, target_sr=target_sr)
return target_sr, resampled
def inference_fn(
model, speaker, input_audio,
f0_predictor, pitch_shift, leakctrl_ratio, diff_steps, noise_scale,
silent_padding, db_threshold, auto_clip, clip_overlap, cross_fade,
adaptive_key, crepe_f0, loudness_ratio, reencode_audio,
):
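    """Run sliced voice conversion on the uploaded audio; return (message, filepath)."""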
if model is None:
return "Error: please load model first", None
if input_audio is None:
return "Error: please upload an audio", None
sample_rate, audio = input_audio
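    # integer PCM (what Gradio delivers for most uploads) becomes float32 in [-1, 1]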
if np.issubdtype(audio.dtype, np.integer):
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
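    # downmix (samples, channels) arrays to mono; to_mono expects channels first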
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
infile = generate_tempfile(suffix=".wav")
soundfile.write(infile, audio, sample_rate, format="wav")
result = model.slice_inference(
infile,
speaker,
pitch_shift,
db_threshold,
leakctrl_ratio,
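        # auto-predict F0 for speech; with "none" the input pitch is followed
        # (crepe still serves as the extractor in that case, see below)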
f0_predictor != "none",
noise_scale,
pad_seconds=silent_padding,
clip_seconds=auto_clip,
lg_num=clip_overlap,
lgr_num=cross_fade,
f0_predictor="crepe" if f0_predictor == "none" else f0_predictor,
enhancer_adaptive_key=adaptive_key,
cr_threshold=crepe_f0,
k_step=diff_steps,
use_spk_mix=False,
second_encoding=reencode_audio,
loudness_envelope_adjustment=loudness_ratio,
)
model.clear_empty()
os.remove(infile)
    # gr.Audio forcibly normalizes audio supplied as a numpy array,
    # so we write to a temporary file and return the filepath instead
prefix = f"{speaker}_{f0_predictor}_pitch{pitch_shift}_timbre{leakctrl_ratio}_diff{diff_steps}_"
outfile = generate_tempfile(suffix=".wav", prefix=prefix)
soundfile.write(outfile, result, model.target_sample, format="wav")
return "Success", outfile
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="so-vits-svc WebUI")
parser.add_argument("-m", "--model", default="./trained")
parser.add_argument("-t", "--temp", default="./workspace")
args = parser.parse_args()
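    # start from a clean workspace; temp files from previous runs are discarded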
shutil.rmtree(args.temp, ignore_errors=True)
os.makedirs(args.temp, exist_ok=True)
TEMPDIR = args.temp
with gr.Blocks() as app:
with gr.Row():
with gr.Column():
title = gr.Markdown(value="""# AI Sora Singing Voice Conversion""")
with gr.Column():
with gr.Accordion(label="About", open=False):
about = gr.Markdown(value="""Space by [KasugaiSakura](https://huggingface.co/KasugaiSakura)
Based on a modified version of [so-vits-svc](https://github.com/meimisaki/so-vits-svc/tree/4.1-Stable)
Voice copyright belongs to [CUFFS/Sphere](https://www.cuffs.co.jp/)""")
with gr.Row():
with gr.Column():
with gr.Accordion(label="Model setup", open=True):
leakctrl = gr.Radio(
label="Timbre leakage control method",
choices=["Feature retrieval", "K-means clustering"],
value="Feature retrieval",
)
diffonly = gr.Checkbox(label="Diffusion only mode")
enhancer = gr.Checkbox(label="NSF-HiFiGAN enhancer (not recommended)")
modelptr = gr.State(None)
modelbtn = gr.Button(value="Load Model", variant="primary")
modelmsg = gr.Textbox(label="Model info")
speaker = gr.Dropdown(label="Speaker", interactive=True)
with gr.Accordion(label="Text to speech", open=False):
tts_text = gr.Textbox(label="Text", placeholder="Enter text here")
                    tts_gender = gr.Radio(label="Gender", choices=["Male", "Female"], value="Male")
tts_lang = gr.Dropdown(label="Language", choices=SUPPORTED_LANGUAGES, value="Auto")
tts_rate = gr.Slider(
label="Relative speed",
minimum=-1, maximum=3, value=0, step=0.1
)
tts_volume = gr.Slider(
label="Relative volume",
minimum=-1, maximum=1.5, value=0, step=0.1
)
tts_btn = gr.Button(value="Synthesize")
with gr.Accordion(label="Voice conversion", open=True):
input_audio = gr.Audio(label="Input audio", type="numpy")
inference_btn = gr.Button(value="Inference")
output_msg = gr.Textbox(label="Output message")
output_audio = gr.Audio(label="Output audio", type="filepath")
with gr.Column():
with gr.Accordion(label="Inference options", open=True):
inference_preset = gr.Radio(
label="Preset",
choices=["Singing", "Speaking"],
value="Singing",
interactive=True,
)
f0_predictor = gr.Dropdown(
label="F0 predictor",
choices=["none", "crepe", "dio", "harvest", "pm", "rmvpe"],
value="none",
)
pitch_shift = gr.Slider(
label="Pitch shift (in semitones, 12 in an octave)",
minimum=-12*MAXOCTAVE, maximum=12*MAXOCTAVE, value=0, step=1,
)
leakctrl_ratio = gr.Slider(
label="Timbre leakage control mix ratio (set to 0 to disable it)",
minimum=0, maximum=1, value=0.5, step=0.1,
)
diff_steps = gr.Slider(
label="Shallow diffusion steps",
minimum=0, maximum=1000, value=100, step=10,
)
noise_scale = gr.Slider(
label="Noise scale (try NOT to modify this parameter)",
minimum=0, maximum=1, value=0.4, step=0.01,
)
silent_padding = gr.Slider(
label="Add silent padding to workaround noise caused by unknown reason (in seconds)",
minimum=0, maximum=3, value=0.5, step=0.01,
)
db_threshold = gr.Slider(
label="Silence dB threshold (for slicing audio into chunks)",
minimum=-100, maximum=0, value=-40, step=1,
)
auto_clip = gr.Slider(
label="Apply auto clip to reduce memory consumption (in seconds)",
minimum=0, maximum=100, value=0, step=1,
)
clip_overlap = gr.Slider(
label="Overlap duration between auto clips (in seconds)",
minimum=0, maximum=3, value=0, step=0.01,
)
cross_fade = gr.Slider(
label="Cross fade ratio of overlapping regions",
minimum=0, maximum=1, value=0.75, step=0.01,
)
adaptive_key = gr.Slider(
label="Enhancer adaptive key (in semitones, 12 in an octave)",
minimum=-12*MAXOCTAVE, maximum=12*MAXOCTAVE, value=0, step=1,
)
crepe_f0 = gr.Slider(
label="CREPE F0 threshold (increase to reduce noise but may result in out-of-tune)",
minimum=0, maximum=1, value=0.05, step=0.01,
)
loudness_ratio = gr.Slider(
label="Loudness envelope mix ratio of input and output (0 is input and 1 is output)",
minimum=0, maximum=1, value=0, step=0.01,
)
reencode_audio = gr.Checkbox(
label="Re-encode audio before shallow diffusion, with unknown impact on final result"
)
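        # event wiring: modelptr is a gr.State carrying the loaded Svc instance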
modelbtn.click(
partial(model_fn, args.model),
inputs=[modelptr, leakctrl, diffonly, enhancer],
outputs=[modelptr, modelbtn, modelmsg, speaker],
)
inference_preset.change(
preset_fn,
inputs=[inference_preset],
outputs=[
f0_predictor, pitch_shift, leakctrl_ratio, diff_steps, noise_scale,
silent_padding, db_threshold, auto_clip, clip_overlap, cross_fade,
adaptive_key, crepe_f0, loudness_ratio, reencode_audio,
],
)
tts_btn.click(
tts_fn,
inputs=[tts_text, tts_gender, tts_lang, tts_rate, tts_volume],
outputs=[input_audio],
)
inference_btn.click(
inference_fn,
inputs=[
modelptr, speaker, input_audio,
f0_predictor, pitch_shift, leakctrl_ratio, diff_steps, noise_scale,
silent_padding, db_threshold, auto_clip, clip_overlap, cross_fade,
adaptive_key, crepe_f0, loudness_ratio, reencode_audio,
],
outputs=[output_msg, output_audio],
)
app.launch(debug=True, share=True)