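"""Gradio WebUI for so-vits-svc based singing voice conversion.

Summary added for clarity: the script loads a trained Svc model (with optional
shallow-diffusion and clustering / feature-retrieval models) from a model
directory, can synthesize input speech with Edge TTS, and converts the input
audio to the selected speaker. Typical invocation (file name assumed):

    python app.py -m ./trained -t ./workspace
"""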
import argparse
import os
import shutil
import subprocess
import sys
import tempfile
from functools import partial

import gradio as gr
import librosa
import numpy as np
import soundfile

from edgetts.tts_voices import SUPPORTED_LANGUAGES
from inference.infer_tool import Svc
MAXOCTAVE = 2
TEMPDIR = None


def generate_tempfile(suffix=None, prefix=None):
    global TEMPDIR
    # close the descriptor returned by mkstemp so we do not leak it
    fd, filepath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=TEMPDIR)
    os.close(fd)
    return filepath
def find_sovits_model(dirpath):
    for filename in os.listdir(dirpath):
        if filename.endswith(".pth"):
            return os.path.join(dirpath, filename)
    return None


def find_diffusion_model(dirpath):
    for filename in os.listdir(dirpath):
        if filename.startswith("model") and filename.endswith(".pt"):
            return os.path.join(dirpath, filename)
    return None


def find_static_file(dirpath, filename):
    filepath = os.path.join(dirpath, filename)
    return filepath if os.path.exists(filepath) else None
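# Expected layout of the model directory, inferred from the lookups in
# model_fn below:
#   *.pth                  -- trained so-vits-svc model
#   config.json            -- so-vits-svc config
#   model*.pt              -- shallow diffusion model
#   config.yaml            -- diffusion config
#   kmeans_10000.pt        -- k-means clustering model
#   feature_and_index.pkl  -- feature retrieval index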
def model_fn(modeldir, model, leakctrl, diffonly, enhancer):
    if model is not None:
        model.unload_model()
    # locate trained models
    sovits_model_path = find_sovits_model(modeldir)
    sovits_config_path = find_static_file(modeldir, "config.json")
    diffusion_model_path = find_diffusion_model(modeldir)
    diffusion_config_path = find_static_file(modeldir, "config.yaml")
    kmeans_model_path = find_static_file(modeldir, "kmeans_10000.pt")
    feature_index_path = find_static_file(modeldir, "feature_and_index.pkl")
    feature_retrieval = leakctrl == "Feature retrieval"
    cluster_model_path = feature_index_path if feature_retrieval else kmeans_model_path
    model = Svc(
        sovits_model_path,
        sovits_config_path,
        cluster_model_path=cluster_model_path,
        feature_retrieval=feature_retrieval,
        diffusion_model_path=diffusion_model_path,
        diffusion_config_path=diffusion_config_path,
        shallow_diffusion=True,
        only_diffusion=diffonly,
        nsf_hifigan_enhance=enhancer,
    )
    speakers = list(model.spk2id.keys())
    return (
        model,
        "Reload Model",
        f"Successfully loaded model into device {model.dev}",
        gr.Dropdown(choices=speakers, value=speakers[0]),
    )
def preset_fn(preset):
    if preset == "Singing":
        f0_predictor = "none"
        leakctrl_ratio = 0.5
    else:
        f0_predictor = "rmvpe"
        leakctrl_ratio = 0
    # Returned values map to:
    #   f0_predictor, pitch_shift, leakctrl_ratio, diff_steps, noise_scale,
    #   silent_padding, db_threshold, auto_clip, clip_overlap, cross_fade,
    #   adaptive_key, crepe_f0, loudness_ratio, reencode_audio
    return (
        f0_predictor, 0, leakctrl_ratio, 100, 0.4,
        0.5, -40, 0, 0, 0.75,
        0, 0.05, 0, False,
    )
def tts_fn(text, gender, lang, rate, volume):
    def to_percent(x):
        return f"+{int(x * 100)}%" if x >= 0 else f"{int(x * 100)}%"
    rate = to_percent(rate)
    volume = to_percent(volume)
    outfile = generate_tempfile(suffix=".wav")
    # run Edge TTS in a subprocess; fail loudly if synthesis did not succeed
    subprocess.run(
        [sys.executable, "edgetts/tts.py", text, lang, rate, volume, gender, outfile],
        check=True,
    )
    result, orig_sr = librosa.load(outfile)
    os.remove(outfile)
    target_sr = 44100
    resampled = librosa.resample(result, orig_sr=orig_sr, target_sr=target_sr)
    return target_sr, resampled
def inference_fn(
    model, speaker, input_audio,
    f0_predictor, pitch_shift, leakctrl_ratio, diff_steps, noise_scale,
    silent_padding, db_threshold, auto_clip, clip_overlap, cross_fade,
    adaptive_key, crepe_f0, loudness_ratio, reencode_audio,
):
    if model is None:
        return "Error: please load model first", None
    if input_audio is None:
        return "Error: please upload an audio", None
    sample_rate, audio = input_audio
    # convert integer PCM to float32 in [-1, 1] and downmix to mono
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    infile = generate_tempfile(suffix=".wav")
    soundfile.write(infile, audio, sample_rate, format="wav")
    result = model.slice_inference(
        infile,
        speaker,
        pitch_shift,
        db_threshold,
        leakctrl_ratio,
        f0_predictor != "none",
        noise_scale,
        pad_seconds=silent_padding,
        clip_seconds=auto_clip,
        lg_num=clip_overlap,
        lgr_num=cross_fade,
        f0_predictor="crepe" if f0_predictor == "none" else f0_predictor,
        enhancer_adaptive_key=adaptive_key,
        cr_threshold=crepe_f0,
        k_step=diff_steps,
        use_spk_mix=False,
        second_encoding=reencode_audio,
        loudness_envelope_adjustment=loudness_ratio,
    )
    model.clear_empty()
    os.remove(infile)
    # gr.Audio force-normalizes audio supplied as a numpy array, so we write
    # the result to a temporary file and return the filepath instead
    prefix = f"{speaker}_{f0_predictor}_pitch{pitch_shift}_timbre{leakctrl_ratio}_diff{diff_steps}_"
    outfile = generate_tempfile(suffix=".wav", prefix=prefix)
    soundfile.write(outfile, result, model.target_sample, format="wav")
    return "Success", outfile
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="so-vits-svc WebUI")
    parser.add_argument("-m", "--model", default="./trained")
    parser.add_argument("-t", "--temp", default="./workspace")
    args = parser.parse_args()
    # start from a clean temporary workspace
    shutil.rmtree(args.temp, ignore_errors=True)
    os.makedirs(args.temp, exist_ok=True)
    TEMPDIR = args.temp
    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                title = gr.Markdown(value="""# AI Sora Singing Voice Conversion""")
            with gr.Column():
                with gr.Accordion(label="About", open=False):
                    about = gr.Markdown(value="""Space by [KasugaiSakura](https://huggingface.co/KasugaiSakura)<br/>Based on a modified version of [so-vits-svc](https://github.com/meimisaki/so-vits-svc/tree/4.1-Stable)<br/>Voice copyright belongs to [CUFFS/Sphere](https://www.cuffs.co.jp/)""")
        with gr.Row():
            with gr.Column():
                with gr.Accordion(label="Model setup", open=True):
                    leakctrl = gr.Radio(
                        label="Timbre leakage control method",
                        choices=["Feature retrieval", "K-means clustering"],
                        value="Feature retrieval",
                    )
                    diffonly = gr.Checkbox(label="Diffusion only mode")
                    enhancer = gr.Checkbox(label="NSF-HiFiGAN enhancer (not recommended)")
                    modelptr = gr.State(None)
                    modelbtn = gr.Button(value="Load Model", variant="primary")
                    modelmsg = gr.Textbox(label="Model info")
                    speaker = gr.Dropdown(label="Speaker", interactive=True)
                with gr.Accordion(label="Text to speech", open=False):
                    tts_text = gr.Textbox(label="Text", placeholder="Enter text here")
                    tts_gender = gr.Radio(label="Gender", choices=["Male", "Female"], value="Male")
                    tts_lang = gr.Dropdown(label="Language", choices=SUPPORTED_LANGUAGES, value="Auto")
                    tts_rate = gr.Slider(
                        label="Relative speed",
                        minimum=-1, maximum=3, value=0, step=0.1,
                    )
                    tts_volume = gr.Slider(
                        label="Relative volume",
                        minimum=-1, maximum=1.5, value=0, step=0.1,
                    )
                    tts_btn = gr.Button(value="Synthesize")
                with gr.Accordion(label="Voice conversion", open=True):
                    input_audio = gr.Audio(label="Input audio", type="numpy")
                    inference_btn = gr.Button(value="Inference")
                    output_msg = gr.Textbox(label="Output message")
                    output_audio = gr.Audio(label="Output audio", type="filepath")
            with gr.Column():
                with gr.Accordion(label="Inference options", open=True):
                    inference_preset = gr.Radio(
                        label="Preset",
                        choices=["Singing", "Speaking"],
                        value="Singing",
                        interactive=True,
                    )
                    f0_predictor = gr.Dropdown(
                        label="F0 predictor",
                        choices=["none", "crepe", "dio", "harvest", "pm", "rmvpe"],
                        value="none",
                    )
                    pitch_shift = gr.Slider(
                        label="Pitch shift (in semitones, 12 per octave)",
                        minimum=-12 * MAXOCTAVE, maximum=12 * MAXOCTAVE, value=0, step=1,
                    )
                    leakctrl_ratio = gr.Slider(
                        label="Timbre leakage control mix ratio (set to 0 to disable)",
                        minimum=0, maximum=1, value=0.5, step=0.1,
                    )
                    diff_steps = gr.Slider(
                        label="Shallow diffusion steps",
                        minimum=0, maximum=1000, value=100, step=10,
                    )
                    noise_scale = gr.Slider(
                        label="Noise scale (try NOT to modify this parameter)",
                        minimum=0, maximum=1, value=0.4, step=0.01,
                    )
                    silent_padding = gr.Slider(
                        label="Silence padding to work around noise of unknown origin (in seconds)",
                        minimum=0, maximum=3, value=0.5, step=0.01,
                    )
                    db_threshold = gr.Slider(
                        label="Silence dB threshold (for slicing audio into chunks)",
                        minimum=-100, maximum=0, value=-40, step=1,
                    )
                    auto_clip = gr.Slider(
                        label="Auto-clip audio to reduce memory consumption (clip length in seconds)",
                        minimum=0, maximum=100, value=0, step=1,
                    )
                    clip_overlap = gr.Slider(
                        label="Overlap duration between auto clips (in seconds)",
                        minimum=0, maximum=3, value=0, step=0.01,
                    )
                    cross_fade = gr.Slider(
                        label="Cross-fade ratio of overlapping regions",
                        minimum=0, maximum=1, value=0.75, step=0.01,
                    )
                    adaptive_key = gr.Slider(
                        label="Enhancer adaptive key (in semitones, 12 per octave)",
                        minimum=-12 * MAXOCTAVE, maximum=12 * MAXOCTAVE, value=0, step=1,
                    )
                    crepe_f0 = gr.Slider(
                        label="CREPE F0 threshold (increase to reduce noise, at the risk of out-of-tune output)",
                        minimum=0, maximum=1, value=0.05, step=0.01,
                    )
                    loudness_ratio = gr.Slider(
                        label="Loudness envelope mix ratio of input and output (0 is input, 1 is output)",
                        minimum=0, maximum=1, value=0, step=0.01,
                    )
                    reencode_audio = gr.Checkbox(
                        label="Re-encode audio before shallow diffusion (unknown impact on the final result)",
                    )
        modelbtn.click(
            partial(model_fn, args.model),
            inputs=[modelptr, leakctrl, diffonly, enhancer],
            outputs=[modelptr, modelbtn, modelmsg, speaker],
        )
        inference_preset.change(
            preset_fn,
            inputs=[inference_preset],
            outputs=[
                f0_predictor, pitch_shift, leakctrl_ratio, diff_steps, noise_scale,
                silent_padding, db_threshold, auto_clip, clip_overlap, cross_fade,
                adaptive_key, crepe_f0, loudness_ratio, reencode_audio,
            ],
        )
        tts_btn.click(
            tts_fn,
            inputs=[tts_text, tts_gender, tts_lang, tts_rate, tts_volume],
            outputs=[input_audio],
        )
        inference_btn.click(
            inference_fn,
            inputs=[
                modelptr, speaker, input_audio,
                f0_predictor, pitch_shift, leakctrl_ratio, diff_steps, noise_scale,
                silent_padding, db_threshold, auto_clip, clip_overlap, cross_fade,
                adaptive_key, crepe_f0, loudness_ratio, reencode_audio,
            ],
            outputs=[output_msg, output_audio],
        )
    app.launch(debug=True, share=True)