Spaces:

onlyswan
/

swan-voice

Running

swan-voice / app.py

Li Fang

Fixed readme

7882057 about 2 years ago

6.82 kB

	import os
	import subprocess
	import re
	import pathlib

	import gradio as gr
	import librosa
	import numpy as np
	import soundfile


	# True when the SPACE_ID environment variable is set to a non-empty value;
	# the rest of the file uses this flag to enable the 30-second upload limit.
	is_running_in_hf = bool(os.environ.get("SPACE_ID", ""))

	# Markdown text rendered on the home tab (passed to gr.Markdown further down).
	# The single %s placeholder in the title line is filled with a note saying
	# local inference is in use and has no duration limit when is_running_in_hf
	# is False, and with an empty string otherwise.
	README_HEADER = """
	Onlyswan官方音频模型 %s
	==================

	这是一个基于OnlySwan的官方音频模型Demo，可以将任意歌曲的清唱/干声转换为OnlySwan的音色。严禁将模型用于任何商业项目。

	音频使用长达40分钟的四万原版音频进行训练，训练Epoch为40000步，音色效果更加接近OnlySwan的音色。

	请切换到上方👆的 “音频转换” 选项卡，在线转换试用


	在线推理
	-------

	在线转换速度慢且需要一定时间，请耐心等待。一般每1秒钟的音频需要4秒钟的时间进行转换。如果音频超过30秒，可能会超时，强烈建议本地(使用自己的电脑)进行转换。

	使用本地电脑进行转换
	-----------------

	### 方法一：使用Docker （推荐，非常简单）

	1. 安装Docker。Docker安装方法请参考: https://docs.docker.com/get-docker/
	2. 运行命令
	```bash
	docker run --pull always --rm -it -p 7860:7860 --platform=linux/amd64 registry.hf.space/onlyswan-swan-voice:latest python app.py
	```
	3. 在浏览器中打开 `http://localhost:7860`

	### 方法二：手动部署Python环境

	1. 使用 https://github.com/voicepaw/so-vits-svc-fork
	2. 下载模型与配置
	* 模型位于 `logs/44k/G_40000.pth`
	* 配置位于 `configs/44k/config.json`
	3. 运行推理脚本

	⚠️注意
	-----

	请确保上传的原始音频文件为清唱/干声/人声，而不是带有伴奏的歌曲。

	关于如何分离歌曲中的人声与伴奏，推荐使用：

	1. Ultimate Vocal Remover (开源免费，效果好，首推)
	2. https://www.google.com/search?q=ai+vocal+remover (自己尝试不同的网站，效果不一)
	3. https://lalal.ai (收费)


	----------


	""" % ("(正在使用本地推理，无时长限制)" if not is_running_in_hf else "")


	# Shorter Markdown header rendered at the top of the conversion tab (passed
	# to gr.Markdown further down). Its %s placeholder is filled the same way as
	# the home-tab header: a "local inference, no duration limit" note when
	# is_running_in_hf is False, otherwise an empty string.
	README_HEADER2 = """
	Onlyswan官方音频模型 %s
	==================

	这是一个基于OnlySwan的官方音频模型Demo，可以将任意歌曲的清唱/干声转换为OnlySwan的音色。严禁将模型用于任何商业项目。

	音频使用长达40分钟的四万原版音频进行训练，训练Epoch为40000步，音色效果更加接近OnlySwan的音色。

	-----------

	""" % ("(正在使用本地推理，无时长限制)" if not is_running_in_hf else "")


	def vc_fn(input_audio, vc_transform, auto_f0, noise_scale, db_threshold, f0_method, progress=gr.Progress()):
	    """Convert an uploaded vocal clip by running the ``svc infer`` CLI.

	    The clip is converted to mono float32 at 44.1 kHz, written to
	    ``temp.wav`` in the current working directory and passed to
	    ``svc infer``; the converted audio is expected at ``temp.out.wav``.

	    Args:
	        input_audio: ``(sampling_rate, samples)`` pair, or ``None`` when
	            nothing was uploaded. Multi-channel input is assumed to be laid
	            out as (samples, channels) -- TODO confirm against the Gradio
	            version in use.
	        vc_transform: Pitch shift; truncated to ``int`` and passed as
	            ``--transpose``.
	        auto_f0: Truthy selects ``--auto-predict-f0``, falsy selects
	            ``--no-auto-predict-f0``.
	        noise_scale: Passed verbatim as ``--noise-scale``.
	        db_threshold: Truncated to ``int`` and passed as ``--db-thresh``.
	        f0_method: Passed verbatim as ``--f0-method``.
	        progress: Gradio progress tracker, called with a fraction and a
	            description.

	    Returns:
	        A ``(status_message, output_path)`` tuple; ``output_path`` is
	        ``"temp.out.wav"`` on success and ``None`` otherwise.
	    """
	    # Remove leftovers one file at a time: with both removals in a single
	    # try block, a missing temp.wav would skip deleting temp.out.wav, and a
	    # stale output would later be reported as a fresh, successful result.
	    for stale_path in ("temp.wav", "temp.out.wav"):
	        try:
	            os.remove(stale_path)
	        except OSError:
	            pass

	    if input_audio is None:
	        return "You need to upload an audio", None
	    sampling_rate, audio = input_audio

	    # An empty clip has nothing to convert and would make the progress
	    # ratio below divide by zero.
	    if audio.shape[0] == 0:
	        return "You need to upload an audio", None

	    duration = audio.shape[0] / sampling_rate
	    if is_running_in_hf and (duration > 30):
	        return "请上传小于30s的音频，需要转换长音频请本地进行转换", None

	    auto_f0_flag = "--auto-predict-f0" if auto_f0 else "--no-auto-predict-f0"

	    progress(0, desc="重新采样...")

	    # Integer PCM is scaled by the dtype's maximum value; np.iinfo raises
	    # on floating-point input, so float samples are only cast.
	    if np.issubdtype(audio.dtype, np.integer):
	        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
	    else:
	        audio = audio.astype(np.float32)
	    if len(audio.shape) > 1:
	        audio = librosa.to_mono(audio.transpose(1, 0))
	    if sampling_rate != 44100:
	        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)

	    progress(0.01, desc="写入临时文件...")

	    out_wav_path = "temp.wav"
	    soundfile.write(out_wav_path, audio, 44100, format="wav")

	    # Build the command as an argument list and run it without a shell, so
	    # request-supplied values (e.g. f0_method) cannot inject shell syntax.
	    infer_args = [
	        "svc", "infer",
	        "--f0-method", str(f0_method),
	        "--db-thresh", str(int(db_threshold)),
	        "--noise-scale", str(noise_scale),
	        "--transpose", str(int(vc_transform)),
	        auto_f0_flag,
	        out_wav_path,
	    ]
	    # Space-joined form, used only for logging and the status message.
	    infer_cmd = " ".join(infer_args)

	    os.environ["PYTHONWARNINGS"] = "ignore"
	    os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(pathlib.Path().resolve(), "huggingface_cache", "hub")
	    print("Executing command: " + infer_cmd)

	    progress(0.02, desc="准备模型中...")

	    # Progress markers look like <<#12.5#>>. The dot is escaped and the
	    # fractional part optional, so only text float() can parse is captured.
	    progress_pattern = re.compile(r'<<#(\d+(?:\.\d+)?)#>>')
	    total_chunks = duration * 44100
	    complete_chunks = 0

	    try:
	        # The context manager closes the pipe and waits for the process.
	        with subprocess.Popen(
	            infer_args,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.STDOUT,
	            universal_newlines=True,
	        ) as process:
	            for ln in process.stdout:
	                print(ln, end='')
	                match = progress_pattern.search(ln)
	                if match:
	                    complete_chunks += float(match.group(1))
	                    # Clamp so the reported fraction never exceeds 100%.
	                    fraction = min(complete_chunks / total_chunks, 1.0)
	                    progress(fraction, desc="正在转换。请勿关闭窗口或刷新页面...")
	    except OSError as err:
	        # Without a shell a missing `svc` executable raises here; report it
	        # through the normal error message instead of crashing the handler.
	        print("Failed to run command: %s" % err)

	    if not os.path.exists("temp.out.wav"):
	        return "发生错误。本次推理所使用的命令为: " + infer_cmd, None
	    return "成功生成OnlySwan音色音频。本次推理所使用的命令为: " + infer_cmd, "temp.out.wav"


	# Assemble the two-tab Gradio interface: a home tab with the README and a
	# sample clip, and a conversion tab whose button is wired to vc_fn.
	with gr.Blocks() as app:
	    with gr.Tabs():
	        with gr.TabItem("主页"):
	            gr.Markdown(value=README_HEADER)
	            demo_audio = gr.Audio(
	                "samples/feiniaohechan_sample_vocal.cover_ai_swan.wav",
	                label="示例音频，点击播放试听效果",
	            )
	        with gr.TabItem("音频转换"):
	            gr.Markdown(value=README_HEADER2)
	            # A voice-selection dropdown (single choice "swan") was left
	            # disabled at this position in the original layout.
	            # The upload label states the 30-second limit only when the
	            # is_running_in_hf flag is set.
	            vc_input3 = gr.Audio(
	                label=(
	                    "上传音频（需要使用清唱/干声。长度小于30秒）"
	                    if is_running_in_hf
	                    else "上传音频（需要使用清唱/干声。长度不限）"
	                )
	            )
	            vc_transform = gr.Number(
	                label="变调。建议保持默认（整数，可以正负，半音数量，升高八度就是12）",
	                value=0,
	            )
	            auto_f0 = gr.Checkbox(
	                label="自动f0预测。歌曲不要勾选，语言类建议勾选。勾选后变调功能失效。",
	                value=False,
	            )
	            noise_scale = gr.Number(label="噪音等级。建议保持默认", value=0.4)
	            db_threshold = gr.Number(label="静音阈值(db)。建议保持默认", value=-30)
	            f0_method = gr.Dropdown(
	                label="音准预测方法。建议保持默认",
	                choices=["crepe", "parselmouth", "dio", "harvest"],
	                value="crepe",
	            )
	            vc_submit = gr.Button("开始生成OnlySwan音色！", variant="primary")
	            vc_output1 = gr.Textbox(label="处理状态")
	            vc_output2 = gr.Audio(
	                label="Swan音频下载。处理完成后请点击播放按钮试听，并使用右边的三个点按钮菜单下载。"
	            )
	            # Inputs are listed in the same order as vc_fn's parameters.
	            vc_submit.click(
	                fn=vc_fn,
	                inputs=[vc_input3, vc_transform, auto_f0, noise_scale, db_threshold, f0_method],
	                outputs=[vc_output1, vc_output2],
	            )

	# Queue requests with a concurrency count of 1, then start the server.
	app.queue(concurrency_count=1)
	print("音频转换器已准备就绪，请打开浏览器访问 http://localhost:7860 开始使用。")
	app.launch()