Spaces:
Sleeping
Sleeping
Li Fang
committed on
Commit
·
d32a8bf
1
Parent(s):
5fdef9f
initial commit
Browse files- .gitignore +2 -0
- README.md +6 -6
- app.py +76 -0
- configs/44k/config.json +98 -0
- logs/44k/G_40000.pth +3 -0
- logs/44k/kmeans_10000.pt +3 -0
- requirements.txt +143 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
venv/
|
2 |
+
.DS_Store
|
README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
|
|
|
1 |
---
|
2 |
+
title: Onlyswan官方语音模型
|
3 |
+
emoji: 🎤
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: green
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.21.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
四万(Onlyswan)官方语音模型
|
app.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import librosa
|
5 |
+
import numpy as np
|
6 |
+
import soundfile
|
7 |
+
|
8 |
+
|
9 |
+
def vc_fn(sid, input_audio, vc_transform, auto_f0, noise_scale, db_threshold, f0_method):
    """Convert an uploaded clip with the ``svc infer`` command-line tool.

    The clip is normalised to mono float32 at 44.1 kHz, written to
    ``temp.wav`` in the working directory, converted by ``svc infer`` and
    the result is read back from ``temp.out.wav``.

    Args:
        sid: Speaker name selected in the UI. Not used by this function.
        input_audio: ``(sampling_rate, samples)`` tuple, or ``None`` when
            nothing was uploaded.
        vc_transform: Pitch shift; truncated to ``int`` before being passed
            to ``--transpose``.
        auto_f0: When truthy ``--auto-predict-f0`` is passed, otherwise
            ``--no-auto-predict-f0``.
        noise_scale: Value passed to ``--noise-scale``.
        db_threshold: Silence threshold; truncated to ``int`` before being
            passed to ``--db-thresh``.
        f0_method: Value passed to ``--f0-method``.

    Returns:
        ``(status_message, audio)`` where ``audio`` is ``(44100, samples)``
        on success and ``None`` on any failure.
    """
    import subprocess  # local import: only this function needs it

    # Remove leftovers from a previous run. Each file is removed on its own
    # so that a missing input file cannot leave a stale output file behind,
    # which would otherwise be returned as the result of a failed run.
    for stale_path in ("temp.wav", "temp.out.wav"):
        try:
            os.remove(stale_path)
        except OSError:
            pass

    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio

    duration = audio.shape[0] / sampling_rate
    if duration > 30:
        return "请上传小于30s的音频,需要转换长音频请本地进行转换", None

    auto_f0_flag = "--auto-predict-f0" if auto_f0 else "--no-auto-predict-f0"

    # Integer PCM is scaled by its dtype's maximum; floating-point input is
    # only cast, because np.iinfo() raises ValueError for float dtypes.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        audio = audio.astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 44100:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)

    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 44100, format="wav")

    # Argument list instead of a shell string: the values come from the web
    # request and must never be interpreted by a shell.
    infer_cmd = [
        "svc", "infer",
        "--f0-method", str(f0_method),
        "--db-thresh", str(int(db_threshold)),
        "--noise-scale", str(noise_scale),
        "--transpose", str(int(vc_transform)),
        auto_f0_flag,
        out_wav_path,
    ]

    print("Executing command: " + " ".join(infer_cmd))
    try:
        completed = subprocess.run(infer_cmd, check=False)
    except OSError as exc:
        return "Conversion failed: could not start svc (%s)" % exc, None
    if completed.returncode != 0 or not os.path.exists("temp.out.wav"):
        return "Conversion failed: svc infer exited with code %d" % completed.returncode, None

    # librosa.load already returns a NumPy array; it has no .numpy() method.
    out_audio, _ = librosa.load("temp.out.wav", sr=44100)
    return "成功", (44100, out_audio)
|
47 |
+
|
48 |
+
|
49 |
+
# Build the single-tab Gradio interface and wire the submit button to vc_fn.
with gr.Blocks() as app:
    with gr.Tabs(), gr.TabItem("Basic"):
        # Introductory text shown above the controls.
        gr.Markdown(
            value="""
Onlyswan官方音频模型
==================

这是一个基于OnlySwan的官方音频模型Demo,可以将任意歌曲的清唱/干声转换为OnlySwan的音色。严禁将模型用于任何商业项目。

音频使用长达40分钟的四万原版音频进行训练,训练Epoch为40000步,音色效果更加接近OnlySwan的音色。

在线转换需要一定时间,请耐心等待。一般每1秒钟的音频需要4秒钟的时间进行转换。如果音频过长,可能会超时,建议本地进行转换。

"""
        )

        # Input controls, created in the order they are displayed.
        sid = gr.Dropdown(
            label="音色选择",
            choices=["swan"],
            value="swan",
        )
        vc_input3 = gr.Audio(label="上传音频(需要使用清唱/干声。长度小于30秒)")
        vc_transform = gr.Number(
            label="变调(整数,可以正负,半音数量,升高八度就是12) * 建议保持默认",
            value=0,
        )
        auto_f0 = gr.Checkbox(
            label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(歌曲不要勾选,语言类建议勾选)",
            value=False,
        )
        noise_scale = gr.Number(label="噪音等级。 建议保持默认", value=0.4)
        db_threshold = gr.Number(label="静音阈值(db)。 建议保持默认", value=-30)
        f0_method = gr.Dropdown(
            label="音准预测方法。建议保持默认",
            choices=["crepe", "parselmouth", "dio", "harvest"],
            value="crepe",
        )

        # Trigger and output widgets.
        vc_submit = gr.Button("开始生成OnlySwan音色!", variant="primary")
        vc_output1 = gr.Textbox(label="处理状态")
        vc_output2 = gr.Audio(label="Swan音频下载。 处理完成后请点击播放按钮试听,并使用右边的三个点按钮菜单下载。")

        # Inputs are passed to vc_fn positionally, in this order.
        vc_submit.click(
            fn=vc_fn,
            inputs=[sid, vc_input3, vc_transform, auto_f0, noise_scale, db_threshold, f0_method],
            outputs=[vc_output1, vc_output2],
        )

app.launch()
|
configs/44k/config.json
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 800,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 40000,
|
7 |
+
"learning_rate": 0.0001,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 24,
|
14 |
+
"fp16_run": false,
|
15 |
+
"bf16_run": false,
|
16 |
+
"lr_decay": 0.999875,
|
17 |
+
"segment_size": 10240,
|
18 |
+
"init_lr_ratio": 1,
|
19 |
+
"warmup_epochs": 0,
|
20 |
+
"c_mel": 45,
|
21 |
+
"c_kl": 1.0,
|
22 |
+
"use_sr": true,
|
23 |
+
"max_speclen": 512,
|
24 |
+
"port": "8001",
|
25 |
+
"keep_ckpts": 3,
|
26 |
+
"num_workers": 4,
|
27 |
+
"log_version": 0,
|
28 |
+
"ckpt_name_by_step": false,
|
29 |
+
"accumulate_grad_batches": 1
|
30 |
+
},
|
31 |
+
"data": {
|
32 |
+
"training_files": "filelists/44k/train.txt",
|
33 |
+
"validation_files": "filelists/44k/val.txt",
|
34 |
+
"max_wav_value": 32768.0,
|
35 |
+
"sampling_rate": 44100,
|
36 |
+
"filter_length": 2048,
|
37 |
+
"hop_length": 512,
|
38 |
+
"win_length": 2048,
|
39 |
+
"n_mel_channels": 80,
|
40 |
+
"mel_fmin": 0.0,
|
41 |
+
"mel_fmax": 22050
|
42 |
+
},
|
43 |
+
"model": {
|
44 |
+
"inter_channels": 192,
|
45 |
+
"hidden_channels": 192,
|
46 |
+
"filter_channels": 768,
|
47 |
+
"n_heads": 2,
|
48 |
+
"n_layers": 6,
|
49 |
+
"kernel_size": 3,
|
50 |
+
"p_dropout": 0.1,
|
51 |
+
"resblock": "1",
|
52 |
+
"resblock_kernel_sizes": [
|
53 |
+
3,
|
54 |
+
7,
|
55 |
+
11
|
56 |
+
],
|
57 |
+
"resblock_dilation_sizes": [
|
58 |
+
[
|
59 |
+
1,
|
60 |
+
3,
|
61 |
+
5
|
62 |
+
],
|
63 |
+
[
|
64 |
+
1,
|
65 |
+
3,
|
66 |
+
5
|
67 |
+
],
|
68 |
+
[
|
69 |
+
1,
|
70 |
+
3,
|
71 |
+
5
|
72 |
+
]
|
73 |
+
],
|
74 |
+
"upsample_rates": [
|
75 |
+
8,
|
76 |
+
8,
|
77 |
+
2,
|
78 |
+
2,
|
79 |
+
2
|
80 |
+
],
|
81 |
+
"upsample_initial_channel": 512,
|
82 |
+
"upsample_kernel_sizes": [
|
83 |
+
16,
|
84 |
+
16,
|
85 |
+
4,
|
86 |
+
4,
|
87 |
+
4
|
88 |
+
],
|
89 |
+
"n_layers_q": 3,
|
90 |
+
"use_spectral_norm": false,
|
91 |
+
"gin_channels": 256,
|
92 |
+
"ssl_dim": 256,
|
93 |
+
"n_speakers": 200
|
94 |
+
},
|
95 |
+
"spk": {
|
96 |
+
"swan": 0
|
97 |
+
}
|
98 |
+
}
|
logs/44k/G_40000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dba0ebbd59490cd980caaf8afea8c3bb6ca0ad5370a17a68c96fd2d153897380
|
3 |
+
size 542789469
|
logs/44k/kmeans_10000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d5994c260173d19468d7f1b2ac558daee2aeaabee557b57bfe7d875238caab3f
|
3 |
+
size 15446201
|
requirements.txt
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==1.4.0
|
2 |
+
aiofiles==23.1.0
|
3 |
+
aiohttp==3.8.4
|
4 |
+
aiosignal==1.3.1
|
5 |
+
altair==5.0.0
|
6 |
+
anyio==3.6.2
|
7 |
+
arrow==1.2.3
|
8 |
+
async-timeout==4.0.2
|
9 |
+
attrs==23.1.0
|
10 |
+
audioread==3.0.0
|
11 |
+
beautifulsoup4==4.12.2
|
12 |
+
blessed==1.20.0
|
13 |
+
cachetools==5.3.0
|
14 |
+
certifi==2023.5.7
|
15 |
+
cffi==1.15.1
|
16 |
+
charset-normalizer==3.1.0
|
17 |
+
click==8.1.3
|
18 |
+
cm-time==0.1.2
|
19 |
+
contourpy==1.0.7
|
20 |
+
croniter==1.3.14
|
21 |
+
cycler==0.11.0
|
22 |
+
Cython==0.29.34
|
23 |
+
dateutils==0.6.12
|
24 |
+
decorator==5.1.1
|
25 |
+
deepdiff==6.3.0
|
26 |
+
fastapi==0.88.0
|
27 |
+
ffmpy==0.3.0
|
28 |
+
filelock==3.12.0
|
29 |
+
fonttools==4.39.4
|
30 |
+
frozenlist==1.3.3
|
31 |
+
fsspec==2023.5.0
|
32 |
+
google-auth==2.18.0
|
33 |
+
google-auth-oauthlib==1.0.0
|
34 |
+
gradio==3.30.0
|
35 |
+
gradio_client==0.2.4
|
36 |
+
grpcio==1.54.2
|
37 |
+
h11==0.14.0
|
38 |
+
httpcore==0.17.0
|
39 |
+
httpx==0.24.0
|
40 |
+
huggingface-hub==0.14.1
|
41 |
+
idna==3.4
|
42 |
+
importlib-metadata==6.6.0
|
43 |
+
importlib-resources==5.12.0
|
44 |
+
inquirer==3.1.3
|
45 |
+
itsdangerous==2.1.2
|
46 |
+
Jinja2==3.1.2
|
47 |
+
joblib==1.2.0
|
48 |
+
jsonschema==4.17.3
|
49 |
+
kiwisolver==1.4.4
|
50 |
+
librosa==0.9.1
|
51 |
+
lightning==2.0.2
|
52 |
+
lightning-cloud==0.5.36
|
53 |
+
lightning-utilities==0.8.0
|
54 |
+
linkify-it-py==2.0.2
|
55 |
+
llvmlite==0.40.0
|
56 |
+
Markdown==3.4.3
|
57 |
+
markdown-it-py==2.2.0
|
58 |
+
MarkupSafe==2.1.2
|
59 |
+
matplotlib==3.7.1
|
60 |
+
mdit-py-plugins==0.3.3
|
61 |
+
mdurl==0.1.2
|
62 |
+
mpmath==1.3.0
|
63 |
+
multidict==6.0.4
|
64 |
+
networkx==3.1
|
65 |
+
numba==0.57.0
|
66 |
+
numpy==1.24.3
|
67 |
+
oauthlib==3.2.2
|
68 |
+
onnx==1.14.0
|
69 |
+
onnxoptimizer==0.3.13
|
70 |
+
onnxsim==0.4.28
|
71 |
+
ordered-set==4.1.0
|
72 |
+
orjson==3.8.12
|
73 |
+
packaging==23.1
|
74 |
+
pandas==2.0.1
|
75 |
+
Pebble==5.0.3
|
76 |
+
Pillow==9.5.0
|
77 |
+
platformdirs==3.5.1
|
78 |
+
playsound==1.3.0
|
79 |
+
pooch==1.7.0
|
80 |
+
praat-parselmouth==0.4.3
|
81 |
+
protobuf==3.20.3
|
82 |
+
psutil==5.9.5
|
83 |
+
pyasn1==0.5.0
|
84 |
+
pyasn1-modules==0.3.0
|
85 |
+
pycparser==2.21
|
86 |
+
pydantic==1.10.7
|
87 |
+
pydub==0.25.1
|
88 |
+
Pygments==2.15.1
|
89 |
+
PyJWT==2.7.0
|
90 |
+
pyparsing==3.0.9
|
91 |
+
pyrsistent==0.19.3
|
92 |
+
PySimpleGUI==4.60.4
|
93 |
+
python-dateutil==2.8.2
|
94 |
+
python-editor==1.0.4
|
95 |
+
python-multipart==0.0.6
|
96 |
+
pytorch-lightning==2.0.2
|
97 |
+
pytz==2023.3
|
98 |
+
pyworld==0.3.3
|
99 |
+
PyYAML==6.0
|
100 |
+
readchar==4.0.5
|
101 |
+
regex==2023.5.5
|
102 |
+
requests==2.30.0
|
103 |
+
requests-oauthlib==1.3.1
|
104 |
+
resampy==0.4.2
|
105 |
+
rich==13.3.5
|
106 |
+
rsa==4.9
|
107 |
+
scikit-learn==1.2.2
|
108 |
+
scipy==1.10.1
|
109 |
+
semantic-version==2.10.0
|
110 |
+
six==1.16.0
|
111 |
+
sniffio==1.3.0
|
112 |
+
so-vits-svc-fork==3.14.1
|
113 |
+
sounddevice==0.4.6
|
114 |
+
soundfile==0.12.1
|
115 |
+
soupsieve==2.4.1
|
116 |
+
starlette==0.22.0
|
117 |
+
starsessions==1.3.0
|
118 |
+
sympy==1.12
|
119 |
+
tensorboard==2.13.0
|
120 |
+
tensorboard-data-server==0.7.0
|
121 |
+
tensorboardX==2.6
|
122 |
+
threadpoolctl==3.1.0
|
123 |
+
tokenizers==0.13.3
|
124 |
+
toolz==0.12.0
|
125 |
+
torch==2.0.1
|
126 |
+
torchaudio==2.0.2
|
127 |
+
torchcrepe==0.0.19
|
128 |
+
torchmetrics==0.11.4
|
129 |
+
tqdm==4.65.0
|
130 |
+
tqdm-joblib==0.0.3
|
131 |
+
traitlets==5.9.0
|
132 |
+
transformers==4.29.1
|
133 |
+
typing_extensions==4.5.0
|
134 |
+
tzdata==2023.3
|
135 |
+
uc-micro-py==1.0.2
|
136 |
+
urllib3==1.26.15
|
137 |
+
uvicorn==0.22.0
|
138 |
+
wcwidth==0.2.6
|
139 |
+
websocket-client==1.5.1
|
140 |
+
websockets==11.0.3
|
141 |
+
Werkzeug==2.3.4
|
142 |
+
yarl==1.9.2
|
143 |
+
zipp==3.15.0
|