Spaces:

onlyswan
/

swan-voice

Sleeping

App Files Files Community

Li Fang committed on May 15, 2023

Commit

d32a8bf

1 Parent(s): 5fdef9f

initial commit

Browse files

Files changed (7) hide show

.gitignore +2 -0
README.md +6 -6
app.py +76 -0
configs/44k/config.json +98 -0
logs/44k/G_40000.pth +3 -0
logs/44k/kmeans_10000.pt +3 -0
requirements.txt +143 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ venv/
2	+ .DS_Store

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
 ---
-title: Swan Voice
-emoji: 🦀
-colorFrom: pink
-colorTo: indigo
 sdk: gradio
-sdk_version: 3.29.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Onlyswan官方语音模型
+emoji: 🎤
+colorFrom: gray
+colorTo: green
 sdk: gradio
+sdk_version: 3.21.0
 app_file: app.py
 pinned: false
 ---
+四万(Onlyswan)官方语音模型

app.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+import gradio as gr
+import librosa
+import numpy as np
+import soundfile
+def vc_fn(sid, input_audio, vc_transform, auto_f0, noise_scale, db_threshold, f0_method):
+    try:
+        os.remove("temp.wav")
+        os.remove("temp.out.wav")
+    except OSError:
+        pass
+    if input_audio is None:
+        return "You need to upload an audio", None
+    sampling_rate, audio = input_audio
+    duration = audio.shape[0] / sampling_rate
+    if duration > 30:
+        return "请上传小于30s的音频，需要转换长音频请本地进行转换", None
+    if auto_f0:
+        auto_f0_flag = "--auto-predict-f0"
+    else:
+        auto_f0_flag = "--no-auto-predict-f0"
+    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+    if len(audio.shape) > 1:
+        audio = librosa.to_mono(audio.transpose(1, 0))
+    if sampling_rate != 44100:
+        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)
+    out_wav_path = "temp.wav"
+    soundfile.write(out_wav_path, audio, 44100, format="wav")
+    infer_cmd = "svc infer --f0-method %s --db-thresh %s --noise-scale %s --transpose %s %s temp.wav" \
+        % (f0_method, int(db_threshold), noise_scale, int(vc_transform), auto_f0_flag)
+    print("Executing command: " + infer_cmd)
+    os.system(infer_cmd)
+    out_audio, _ = librosa.load("temp.out.wav", sr=44100)
+    return "成功", (44100, out_audio.numpy())
+# Build the Gradio UI: one "Basic" tab with the conversion form, wired to vc_fn.
+app = gr.Blocks()
+with app:
+    with gr.Tabs():
+        with gr.TabItem("Basic"):
+            # Introductory text shown above the form.
+            gr.Markdown(value="""
+                Onlyswan官方音频模型
+                ==================
+                这是一个基于OnlySwan的官方音频模型Demo，可以将任意歌曲的清唱/干声转换为OnlySwan的音色。严禁将模型用于任何商业项目。
+                音频使用长达40分钟的四万原版音频进行训练，训练Epoch为40000步，音色效果更加接近OnlySwan的音色。
+                在线转换需要一定时间，请耐心等待。一般每1秒钟的音频需要4秒钟的时间进行转换。如果音频过长，可能会超时，建议本地进行转换。
+                """)
+            # Input widgets; they are passed to vc_fn in the order listed in the click() call below.
+            sid = gr.Dropdown(label="音色选择", choices=["swan"], value="swan")
+            vc_input3 = gr.Audio(label="上传音频（需要使用清唱/干声。长度小于30秒）")
+            vc_transform = gr.Number(label="变调（整数，可以正负，半音数量，升高八度就是12） * 建议保持默认", value=0)
+            auto_f0 = gr.Checkbox(label="自动f0预测，配合聚类模型f0预测效果更好,会导致变调功能失效（歌曲不要勾选，语言类建议勾选）", value=False)
+            noise_scale = gr.Number(label="噪音等级。 建议保持默认", value=0.4)
+            db_threshold = gr.Number(label="静音阈值(db)。 建议保持默认", value=-30)
+            f0_method = gr.Dropdown(label="音准预测方法。建议保持默认", choices=["crepe", "parselmouth", "dio", "harvest"], value="crepe")
+            vc_submit = gr.Button("开始生成OnlySwan音色！", variant="primary")
+            # Output widgets: status text and the converted audio returned by vc_fn.
+            vc_output1 = gr.Textbox(label="处理状态")
+            vc_output2 = gr.Audio(label="Swan音频下载。 处理完成后请点击播放按钮试听，并使用右边的三个点按钮菜单下载。")
+        # Clicking the button calls vc_fn with the seven inputs and writes its two return values to the outputs.
+        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform, auto_f0, noise_scale, db_threshold, f0_method], [vc_output1, vc_output2])
+    # NOTE(review): launch() runs at import time and inside the `with app:` block -- confirm this is intended.
+    app.launch()

configs/44k/config.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 800,
+    "seed": 1234,
+    "epochs": 40000,
+    "learning_rate": 0.0001,
+    "betas": [
+      0.8,
+      0.99
+    ],
+    "eps": 1e-09,
+    "batch_size": 24,
+    "fp16_run": false,
+    "bf16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 10240,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 512,
+    "port": "8001",
+    "keep_ckpts": 3,
+    "num_workers": 4,
+    "log_version": 0,
+    "ckpt_name_by_step": false,
+    "accumulate_grad_batches": 1
+  },
+  "data": {
+    "training_files": "filelists/44k/train.txt",
+    "validation_files": "filelists/44k/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": 22050
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      4,
+      4,
+      4
+    ],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 256,
+    "n_speakers": 200
+  },
+  "spk": {
+    "swan": 0
+  }
+}

logs/44k/G_40000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dba0ebbd59490cd980caaf8afea8c3bb6ca0ad5370a17a68c96fd2d153897380
+size 542789469

logs/44k/kmeans_10000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5994c260173d19468d7f1b2ac558daee2aeaabee557b57bfe7d875238caab3f
+size 15446201

requirements.txt ADDED Viewed

	@@ -0,0 +1,143 @@

+absl-py==1.4.0
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==5.0.0
+anyio==3.6.2
+arrow==1.2.3
+async-timeout==4.0.2
+attrs==23.1.0
+audioread==3.0.0
+beautifulsoup4==4.12.2
+blessed==1.20.0
+cachetools==5.3.0
+certifi==2023.5.7
+cffi==1.15.1
+charset-normalizer==3.1.0
+click==8.1.3
+cm-time==0.1.2
+contourpy==1.0.7
+croniter==1.3.14
+cycler==0.11.0
+Cython==0.29.34
+dateutils==0.6.12
+decorator==5.1.1
+deepdiff==6.3.0
+fastapi==0.88.0
+ffmpy==0.3.0
+filelock==3.12.0
+fonttools==4.39.4
+frozenlist==1.3.3
+fsspec==2023.5.0
+google-auth==2.18.0
+google-auth-oauthlib==1.0.0
+gradio==3.30.0
+gradio_client==0.2.4
+grpcio==1.54.2
+h11==0.14.0
+httpcore==0.17.0
+httpx==0.24.0
+huggingface-hub==0.14.1
+idna==3.4
+importlib-metadata==6.6.0
+importlib-resources==5.12.0
+inquirer==3.1.3
+itsdangerous==2.1.2
+Jinja2==3.1.2
+joblib==1.2.0
+jsonschema==4.17.3
+kiwisolver==1.4.4
+librosa==0.9.1
+lightning==2.0.2
+lightning-cloud==0.5.36
+lightning-utilities==0.8.0
+linkify-it-py==2.0.2
+llvmlite==0.40.0
+Markdown==3.4.3
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+networkx==3.1
+numba==0.57.0
+numpy==1.24.3
+oauthlib==3.2.2
+onnx==1.14.0
+onnxoptimizer==0.3.13
+onnxsim==0.4.28
+ordered-set==4.1.0
+orjson==3.8.12
+packaging==23.1
+pandas==2.0.1
+Pebble==5.0.3
+Pillow==9.5.0
+platformdirs==3.5.1
+playsound==1.3.0
+pooch==1.7.0
+praat-parselmouth==0.4.3
+protobuf==3.20.3
+psutil==5.9.5
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pycparser==2.21
+pydantic==1.10.7
+pydub==0.25.1
+Pygments==2.15.1
+PyJWT==2.7.0
+pyparsing==3.0.9
+pyrsistent==0.19.3
+PySimpleGUI==4.60.4
+python-dateutil==2.8.2
+python-editor==1.0.4
+python-multipart==0.0.6
+pytorch-lightning==2.0.2
+pytz==2023.3
+pyworld==0.3.3
+PyYAML==6.0
+readchar==4.0.5
+regex==2023.5.5
+requests==2.30.0
+requests-oauthlib==1.3.1
+resampy==0.4.2
+rich==13.3.5
+rsa==4.9
+scikit-learn==1.2.2
+scipy==1.10.1
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+so-vits-svc-fork==3.14.1
+sounddevice==0.4.6
+soundfile==0.12.1
+soupsieve==2.4.1
+starlette==0.22.0
+starsessions==1.3.0
+sympy==1.12
+tensorboard==2.13.0
+tensorboard-data-server==0.7.0
+tensorboardX==2.6
+threadpoolctl==3.1.0
+tokenizers==0.13.3
+toolz==0.12.0
+torch==2.0.1
+torchaudio==2.0.2
+torchcrepe==0.0.19
+torchmetrics==0.11.4
+tqdm==4.65.0
+tqdm-joblib==0.0.3
+traitlets==5.9.0
+transformers==4.29.1
+typing_extensions==4.5.0
+tzdata==2023.3
+uc-micro-py==1.0.2
+urllib3==1.26.15
+uvicorn==0.22.0
+wcwidth==0.2.6
+websocket-client==1.5.1
+websockets==11.0.3
+Werkzeug==2.3.4
+yarl==1.9.2
+zipp==3.15.0