Li Fang committed on
Commit
d32a8bf
·
1 Parent(s): 5fdef9f

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ venv/
2
+ .DS_Store
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: Swan Voice
3
- emoji: 🦀
4
- colorFrom: pink
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.29.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Onlyswan官方语音模型
3
+ emoji: 🎤
4
+ colorFrom: gray
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.21.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ 四万(Onlyswan)官方语音模型
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import librosa
5
+ import numpy as np
6
+ import soundfile
7
+
8
+
9
def vc_fn(sid, input_audio, vc_transform, auto_f0, noise_scale, db_threshold, f0_method):
    """Convert an uploaded clip with the ``svc infer`` command-line tool.

    The clip is normalised to mono float32 at 44.1 kHz, written to a
    per-request temporary directory, converted by ``svc infer`` and the
    result is read back.

    Args:
        sid: Speaker chosen in the UI. Not used by this function; the CLI
            is invoked without a speaker argument.
        input_audio: ``(sampling_rate, samples)`` tuple, or ``None`` when
            nothing was uploaded. ``samples`` is presumably shaped
            ``(n_samples,)`` or ``(n_samples, n_channels)`` -- TODO confirm
            against the Gradio version in use.
        vc_transform: Pitch shift passed to ``--transpose`` (truncated to int).
        auto_f0: When truthy pass ``--auto-predict-f0``, otherwise
            ``--no-auto-predict-f0``.
        noise_scale: Value passed to ``--noise-scale``.
        db_threshold: Value passed to ``--db-thresh`` (truncated to int).
        f0_method: Value passed to ``--f0-method``.

    Returns:
        ``(status_message, audio)`` where ``audio`` is ``None`` on failure
        or ``(44100, samples)`` on success.
    """
    # Function-scope imports keep this block self-contained.
    import subprocess
    import tempfile

    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    if audio.shape[0] == 0:
        # A zero-length clip carries nothing to convert.
        return "You need to upload an audio", None

    duration = audio.shape[0] / sampling_rate
    if duration > 30:
        return "请上传小于30s的音频,需要转换长音频请本地进行转换", None

    # Integer PCM is scaled by the dtype's maximum; np.iinfo rejects float
    # dtypes, so float input is only cast.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        audio = audio.astype(np.float32)
    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 44100:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)

    auto_f0_flag = "--auto-predict-f0" if auto_f0 else "--no-auto-predict-f0"

    # A private directory per call: concurrent requests cannot overwrite each
    # other's files and no stale output from an earlier run can be picked up.
    with tempfile.TemporaryDirectory() as work_dir:
        in_wav_path = os.path.join(work_dir, "temp.wav")
        # As before, the CLI is relied on to write "<stem>.out.wav" next to
        # its input file.
        out_wav_path = os.path.join(work_dir, "temp.out.wav")
        soundfile.write(in_wav_path, audio, 44100, format="wav")

        # Argument list, no shell: request-supplied values cannot inject
        # additional commands.
        infer_cmd = [
            "svc", "infer",
            "--f0-method", str(f0_method),
            "--db-thresh", str(int(db_threshold)),
            "--noise-scale", str(noise_scale),
            "--transpose", str(int(vc_transform)),
            auto_f0_flag,
            in_wav_path,
        ]
        print("Executing command: " + " ".join(infer_cmd))

        try:
            exit_code = subprocess.run(infer_cmd, check=False).returncode
        except OSError as err:
            # Raised when the "svc" executable cannot be started at all.
            return "Conversion failed: could not run svc (%s)" % err, None

        if exit_code != 0 or not os.path.exists(out_wav_path):
            return "Conversion failed: svc infer exited with code %d" % exit_code, None

        # Read the result before the temporary directory is removed.
        out_audio, _ = librosa.load(out_wav_path, sr=44100)

    # librosa.load already returns a NumPy array.
    return "成功", (44100, out_audio)
47
+
48
+
49
+ # Gradio UI: a Blocks app with a single "Basic" tab holding a Markdown intro,
+ # the input controls, a submit button and the status/audio outputs.
+ # NOTE(review): indentation is not preserved in this view, so the nesting of
+ # the statements below (including app.launch()) should be confirmed against
+ # the real file.
+ app = gr.Blocks()
50
+ with app:
51
+ with gr.Tabs():
52
+ with gr.TabItem("Basic"):
53
+ gr.Markdown(value="""
54
+ Onlyswan官方音频模型
55
+ ==================
56
+
57
+ 这是一个基于OnlySwan的官方音频模型Demo,可以将任意歌曲的清唱/干声转换为OnlySwan的音色。严禁将模型用于任何商业项目。
58
+
59
+ 音频使用长达40分钟的四万原版音频进行训练,训练Epoch为40000步,音色效果更加接近OnlySwan的音色。
60
+
61
+ 在线转换需要一定时间,请耐心等待。一般每1秒钟的音频需要4秒钟的时间进行转换。如果音频过长,可能会超时,建议本地进行转换。
62
+
63
+ """)
64
+ # Input controls; their values are passed positionally to vc_fn by the
+ # click handler below. "swan" is the only speaker choice offered.
+ sid = gr.Dropdown(label="音色选择", choices=["swan"], value="swan")
65
+ vc_input3 = gr.Audio(label="上传音频(需要使用清唱/干声。长度小于30秒)")
66
+ vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12) * 建议保持默认", value=0)
67
+ auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(歌曲不要勾选,语言类建议勾选)", value=False)
68
+ noise_scale = gr.Number(label="噪音等级。 建议保持默认", value=0.4)
69
+ db_threshold = gr.Number(label="静音阈值(db)。 建议保持默认", value=-30)
70
+ f0_method = gr.Dropdown(label="音准预测方法。建议保持默认", choices=["crepe", "parselmouth", "dio", "harvest"], value="crepe")
71
+ vc_submit = gr.Button("开始生成OnlySwan音色!", variant="primary")
72
+ # Outputs: vc_fn's first return value goes to the textbox, the second to
+ # the audio player.
+ vc_output1 = gr.Textbox(label="处理状态")
73
+ vc_output2 = gr.Audio(label="Swan音频下载。 处理完成后请点击播放按钮试听,并使用右边的三个点按钮菜单下载。")
74
+ # The input list order must match vc_fn's parameter order.
+ vc_submit.click(vc_fn, [sid, vc_input3, vc_transform, auto_f0, noise_scale, db_threshold, f0_method], [vc_output1, vc_output2])
75
+
76
+ # Runs at module level: the app is launched whenever this file is executed.
+ app.launch()
configs/44k/config.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 40000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 24,
14
+ "fp16_run": false,
15
+ "bf16_run": false,
16
+ "lr_decay": 0.999875,
17
+ "segment_size": 10240,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "use_sr": true,
23
+ "max_speclen": 512,
24
+ "port": "8001",
25
+ "keep_ckpts": 3,
26
+ "num_workers": 4,
27
+ "log_version": 0,
28
+ "ckpt_name_by_step": false,
29
+ "accumulate_grad_batches": 1
30
+ },
31
+ "data": {
32
+ "training_files": "filelists/44k/train.txt",
33
+ "validation_files": "filelists/44k/val.txt",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 80,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": 22050
42
+ },
43
+ "model": {
44
+ "inter_channels": 192,
45
+ "hidden_channels": 192,
46
+ "filter_channels": 768,
47
+ "n_heads": 2,
48
+ "n_layers": 6,
49
+ "kernel_size": 3,
50
+ "p_dropout": 0.1,
51
+ "resblock": "1",
52
+ "resblock_kernel_sizes": [
53
+ 3,
54
+ 7,
55
+ 11
56
+ ],
57
+ "resblock_dilation_sizes": [
58
+ [
59
+ 1,
60
+ 3,
61
+ 5
62
+ ],
63
+ [
64
+ 1,
65
+ 3,
66
+ 5
67
+ ],
68
+ [
69
+ 1,
70
+ 3,
71
+ 5
72
+ ]
73
+ ],
74
+ "upsample_rates": [
75
+ 8,
76
+ 8,
77
+ 2,
78
+ 2,
79
+ 2
80
+ ],
81
+ "upsample_initial_channel": 512,
82
+ "upsample_kernel_sizes": [
83
+ 16,
84
+ 16,
85
+ 4,
86
+ 4,
87
+ 4
88
+ ],
89
+ "n_layers_q": 3,
90
+ "use_spectral_norm": false,
91
+ "gin_channels": 256,
92
+ "ssl_dim": 256,
93
+ "n_speakers": 200
94
+ },
95
+ "spk": {
96
+ "swan": 0
97
+ }
98
+ }
logs/44k/G_40000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dba0ebbd59490cd980caaf8afea8c3bb6ca0ad5370a17a68c96fd2d153897380
3
+ size 542789469
logs/44k/kmeans_10000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5994c260173d19468d7f1b2ac558daee2aeaabee557b57bfe7d875238caab3f
3
+ size 15446201
requirements.txt ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==1.4.0
2
+ aiofiles==23.1.0
3
+ aiohttp==3.8.4
4
+ aiosignal==1.3.1
5
+ altair==5.0.0
6
+ anyio==3.6.2
7
+ arrow==1.2.3
8
+ async-timeout==4.0.2
9
+ attrs==23.1.0
10
+ audioread==3.0.0
11
+ beautifulsoup4==4.12.2
12
+ blessed==1.20.0
13
+ cachetools==5.3.0
14
+ certifi==2023.5.7
15
+ cffi==1.15.1
16
+ charset-normalizer==3.1.0
17
+ click==8.1.3
18
+ cm-time==0.1.2
19
+ contourpy==1.0.7
20
+ croniter==1.3.14
21
+ cycler==0.11.0
22
+ Cython==0.29.34
23
+ dateutils==0.6.12
24
+ decorator==5.1.1
25
+ deepdiff==6.3.0
26
+ fastapi==0.88.0
27
+ ffmpy==0.3.0
28
+ filelock==3.12.0
29
+ fonttools==4.39.4
30
+ frozenlist==1.3.3
31
+ fsspec==2023.5.0
32
+ google-auth==2.18.0
33
+ google-auth-oauthlib==1.0.0
34
+ gradio==3.30.0
35
+ gradio_client==0.2.4
36
+ grpcio==1.54.2
37
+ h11==0.14.0
38
+ httpcore==0.17.0
39
+ httpx==0.24.0
40
+ huggingface-hub==0.14.1
41
+ idna==3.4
42
+ importlib-metadata==6.6.0
43
+ importlib-resources==5.12.0
44
+ inquirer==3.1.3
45
+ itsdangerous==2.1.2
46
+ Jinja2==3.1.2
47
+ joblib==1.2.0
48
+ jsonschema==4.17.3
49
+ kiwisolver==1.4.4
50
+ librosa==0.9.1
51
+ lightning==2.0.2
52
+ lightning-cloud==0.5.36
53
+ lightning-utilities==0.8.0
54
+ linkify-it-py==2.0.2
55
+ llvmlite==0.40.0
56
+ Markdown==3.4.3
57
+ markdown-it-py==2.2.0
58
+ MarkupSafe==2.1.2
59
+ matplotlib==3.7.1
60
+ mdit-py-plugins==0.3.3
61
+ mdurl==0.1.2
62
+ mpmath==1.3.0
63
+ multidict==6.0.4
64
+ networkx==3.1
65
+ numba==0.57.0
66
+ numpy==1.24.3
67
+ oauthlib==3.2.2
68
+ onnx==1.14.0
69
+ onnxoptimizer==0.3.13
70
+ onnxsim==0.4.28
71
+ ordered-set==4.1.0
72
+ orjson==3.8.12
73
+ packaging==23.1
74
+ pandas==2.0.1
75
+ Pebble==5.0.3
76
+ Pillow==9.5.0
77
+ platformdirs==3.5.1
78
+ playsound==1.3.0
79
+ pooch==1.7.0
80
+ praat-parselmouth==0.4.3
81
+ protobuf==3.20.3
82
+ psutil==5.9.5
83
+ pyasn1==0.5.0
84
+ pyasn1-modules==0.3.0
85
+ pycparser==2.21
86
+ pydantic==1.10.7
87
+ pydub==0.25.1
88
+ Pygments==2.15.1
89
+ PyJWT==2.7.0
90
+ pyparsing==3.0.9
91
+ pyrsistent==0.19.3
92
+ PySimpleGUI==4.60.4
93
+ python-dateutil==2.8.2
94
+ python-editor==1.0.4
95
+ python-multipart==0.0.6
96
+ pytorch-lightning==2.0.2
97
+ pytz==2023.3
98
+ pyworld==0.3.3
99
+ PyYAML==6.0
100
+ readchar==4.0.5
101
+ regex==2023.5.5
102
+ requests==2.30.0
103
+ requests-oauthlib==1.3.1
104
+ resampy==0.4.2
105
+ rich==13.3.5
106
+ rsa==4.9
107
+ scikit-learn==1.2.2
108
+ scipy==1.10.1
109
+ semantic-version==2.10.0
110
+ six==1.16.0
111
+ sniffio==1.3.0
112
+ so-vits-svc-fork==3.14.1
113
+ sounddevice==0.4.6
114
+ soundfile==0.12.1
115
+ soupsieve==2.4.1
116
+ starlette==0.22.0
117
+ starsessions==1.3.0
118
+ sympy==1.12
119
+ tensorboard==2.13.0
120
+ tensorboard-data-server==0.7.0
121
+ tensorboardX==2.6
122
+ threadpoolctl==3.1.0
123
+ tokenizers==0.13.3
124
+ toolz==0.12.0
125
+ torch==2.0.1
126
+ torchaudio==2.0.2
127
+ torchcrepe==0.0.19
128
+ torchmetrics==0.11.4
129
+ tqdm==4.65.0
130
+ tqdm-joblib==0.0.3
131
+ traitlets==5.9.0
132
+ transformers==4.29.1
133
+ typing_extensions==4.5.0
134
+ tzdata==2023.3
135
+ uc-micro-py==1.0.2
136
+ urllib3==1.26.15
137
+ uvicorn==0.22.0
138
+ wcwidth==0.2.6
139
+ websocket-client==1.5.1
140
+ websockets==11.0.3
141
+ Werkzeug==2.3.4
142
+ yarl==1.9.2
143
+ zipp==3.15.0