# Chitsanfei
# fix: information
# 327a696
import io
import logging
import os
import tempfile

# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
import gradio as gr
import librosa
import numpy as np
import soundfile

from inference.infer_tool import Svc
# Raise the level of chatty third-party loggers so only warnings and above
# reach the console.
for _noisy_logger in ("numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Path of the model configuration file; kept in one place so the value passed
# to Svc below cannot drift from it.
config_path = "configs/config.json"

# The model is loaded once at import time and shared by every request that
# vc_fn handles.
model = Svc("logs/44k/G_130400.pth", config_path, cluster_model_path="logs/44k/kmeans.pt")
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale):
    """Convert an uploaded clip with the module-level ``model``.

    Args:
        sid: Speaker to convert to; forwarded to ``model.slice_inference``.
        input_audio: ``(sampling_rate, samples)`` pair from the audio input,
            or ``None`` when nothing was uploaded.
        vc_transform: Pitch shift, forwarded unchanged to the model.
        auto_f0: Automatic f0 prediction flag, forwarded to the model.
        cluster_ratio: Cluster-model mixing ratio, forwarded to the model.
        slice_db: Slicing threshold, forwarded to the model.
        noise_scale: Noise scale, forwarded to the model.

    Returns:
        A ``(message, audio)`` pair. ``audio`` is ``(44100, waveform)`` on
        success and ``None`` when the request is rejected (no upload, or a
        clip longer than 90 seconds).
    """
    max_seconds = 90
    model_input_sr = 16000
    # NOTE(review): assumes the loaded checkpoint produces 44.1 kHz audio —
    # confirm against the model configuration.
    output_sr = 44100

    if input_audio is None:
        return "You need to upload an audio", None

    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > max_seconds:
        return "请上传小于90s的音频,需要转换长音频请本地进行转换", None

    # Integer PCM is scaled by the dtype's maximum. np.iinfo raises ValueError
    # for floating-point dtypes, so float input is passed through unscaled
    # (presumably it is already in [-1, 1] — verify against the caller).
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio / np.iinfo(audio.dtype).max
    audio = audio.astype(np.float32)

    # Multi-channel input arrives as (samples, channels); librosa wants
    # (channels, samples) before mixing down to mono.
    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != model_input_sr:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=model_input_sr)
    print(audio.shape)

    # Each request gets its own scratch directory: a shared fixed file name
    # would let concurrent requests overwrite each other's input, and the
    # context manager removes the file once inference is done.
    with tempfile.TemporaryDirectory() as work_dir:
        wav_path = os.path.join(work_dir, "input.wav")
        soundfile.write(wav_path, audio, model_input_sr, format="wav")
        print(cluster_ratio, auto_f0, noise_scale)
        _audio = model.slice_inference(
            wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale
        )
    return "Success", (output_sr, _audio)
# Build the single-tab web UI. Components are created inside the Blocks
# context so they are attached to `app`.
with gr.Blocks() as app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            # Introduction and usage notes shown at the top of the page. The
            # literal is kept flush-left so its text is passed through unchanged.
            gr.Markdown(value="""
# sovits-emu-voice-transform | 可以变成凤笑梦的在线变声器
[![Visitors](https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FMashiroSA%2Fsovits-emu-voice-transform&labelColor=%23f47373&countColor=%23555555)](https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FMashiroSA%2Fsovits-emu-voice-transform)
<br/>
**说明 / Introduction**
- 基于so-vits-svc 4.0的官方库示例修改而成。
- 该项目用于便携的基于云计算的变声成为Project Sekai的角色鳳えむ(凤笑梦)。
- 所使用的音声训练集基于对话而来,因而转换后的音声在对话表现中会比乐曲中的人声中要好。
- 该项目以无盈利模式进行。
- Modified from the official library example based on so-vits-svc 4.0.
- The sound training set used is based on dialogue, thus the converted sound will perform better in dialogue than the vocals in the music.
- This project is conducted in no-profit.
```text
For academic purpose only and not for illegal purposes. We have no relationship or interest with SEGA or related organizations.
The model derivation output is only similar to Otori Emu and there is inevitable loss, which cannot be fully simulated.
If you have any questions, please send an email or forum for inquiry.
```
<br/>
**如何使用**
- 如果用于日常说话时的对话转换,请提前录制一段低于90s的人声干声,上传,勾选下面的自动f0预测,其它的可以不用动,直接转换,过一会儿就能听到转换的声音了。
- 如果是乐曲中的人声,你可以使用自己的清唱,或者使用UVR5软件进行干声提取,上传,不要勾选自动f0预测,按情况进行变调(模型实际测试高于标准音C4的类似度较高,输入的干声是男声请+12,女声可以先不变),然后转换。
- 转换后的进度条右侧有个省略的点,在那边可以下载。
- 本repo的管理者 @MashiroSA 看不到你输入和输出后的内容,只有Hugging Face官方也许可以看到,请放心。
- 关于下面选项中的聚类模型的使用:默认为0,值是0-1,越高越能贴近模型音色,但会导致咬字不清。
""")

            # Speaker choices come from the loaded model; the first one is
            # preselected.
            spks = list(model.spk2id.keys())
            sid = gr.Dropdown(label="音色", choices=spks, value=spks[0])

            # Conversion inputs.
            vc_input3 = gr.Audio(label="上传音频(长度小于90秒)")
            vc_transform = gr.Number(
                label="变调(整数,可以正负,半音数量,升高八度就是12,当你觉得音色不准确时可以适当调高或降低,当自动f0预测勾选后该项失效)",
                value=0,
            )
            cluster_ratio = gr.Number(
                label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)",
                value=0,
            )
            auto_f0 = gr.Checkbox(
                label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会究极跑调)",
                value=False,
            )
            slice_db = gr.Number(label="切片阈值", value=-40)
            noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)

            # Trigger and result widgets.
            vc_submit = gr.Button("转换", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")

        # The input order must match vc_fn's positional parameters; the two
        # outputs receive its (message, audio) return value.
        conversion_inputs = [
            sid,
            vc_input3,
            vc_transform,
            auto_f0,
            cluster_ratio,
            slice_db,
            noise_scale,
        ]
        conversion_outputs = [vc_output1, vc_output2]
        vc_submit.click(fn=vc_fn, inputs=conversion_inputs, outputs=conversion_outputs)

# Start the web server once the layout is fully defined.
app.launch()