Personal-TTS

Runtime error

App Files Files Community

Personal-TTS / app.py

softboy

Update app.py

47c765c about 1 year ago

raw

history blame

5.33 kB

	import os
	import gradio as gr
	import random

	# Install the pinned build/runtime dependencies at start-up, before the
	# packages that need them are imported further down.
	#
	# pip is invoked as "<current interpreter> -m pip" with an argument list
	# (no shell), so the packages land in the environment that is actually
	# running this script.  Installation stays best-effort: a failing install is
	# reported on stderr but does not abort start-up.
	import subprocess
	import sys

	_PIP_INSTALLS = (
	    ("--upgrade", "Cython==0.29.35"),
	    ("pysptk", "--no-build-isolation"),
	    ("kantts", "-f", "https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html"),
	    ("librosa==0.9.2",),
	    ("numpy==1.22.4",),
	)

	for _pip_args in _PIP_INSTALLS:
	    _pip_result = subprocess.run(
	        [sys.executable, "-m", "pip", "install"] + list(_pip_args),
	        check=False,
	    )
	    if _pip_result.returncode != 0:
	        print(
	            "pip install {} exited with status {}".format(
	                " ".join(_pip_args), _pip_result.returncode
	            ),
	            file=sys.stderr,
	        )

	from modelscope.models.audio.tts import SambertHifigan
	from modelscope.pipelines import pipeline
	from modelscope.utils.constant import Tasks

	from voicefixer import VoiceFixer
	# Shared VoiceFixer instance, created once at import time; inference_denoise()
	# calls its restore() method.
	voicefixer = VoiceFixer()

	# Text-to-speech pipelines for the two voices.  Both voices use the same
	# directory layout below their work directory, so the checkpoint mapping is
	# built by one helper instead of being spelled out twice.


	def _custom_ckpt_paths(work_dir):
	    """Return the ``custom_ckpt`` mapping for the voice stored under *work_dir*.

	    Args:
	        work_dir: Absolute path of a ``pretrain_work_dir`` directory.

	    Returns:
	        A dict with the voice name ``'F7'`` and the paths, all below
	        *work_dir*, for the keys ``am_ckpt``, ``am_config``, ``voc_ckpt``,
	        ``voc_config``, ``audio_config`` and ``se_file``.
	    """
	    hifigan_dir = os.path.join(work_dir, 'orig_model', 'basemodel_16k', 'hifigan')
	    return {
	        'voice_name': 'F7',
	        'am_ckpt': os.path.join(work_dir, 'tmp_am', 'ckpt'),
	        'am_config': os.path.join(work_dir, 'tmp_am', 'config.yaml'),
	        'voc_ckpt': os.path.join(hifigan_dir, 'ckpt'),
	        'voc_config': os.path.join(hifigan_dir, 'config.yaml'),
	        'audio_config': os.path.join(work_dir, 'data', 'audio_config.yaml'),
	        'se_file': os.path.join(work_dir, 'data', 'se', 'se.npy'),
	    }


	# model_0

	model_dir = os.path.abspath("./pretrain_work_dir")
	custom_infer_abs = _custom_ckpt_paths(model_dir)
	kwargs = {'custom_ckpt': custom_infer_abs}

	model_id = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)

	# Pipeline called by infer().
	inference = pipeline(task=Tasks.text_to_speech, model=model_id)

	# model_1

	model_dir1 = os.path.abspath("./jay/pretrain_work_dir")
	custom_infer_abs1 = _custom_ckpt_paths(model_dir1)
	kwargs1 = {'custom_ckpt': custom_infer_abs1}

	model_id1 = SambertHifigan(os.path.join(model_dir1, "orig_model"), **kwargs1)

	# Pipeline called by infer1().
	inference1 = pipeline(task=Tasks.text_to_speech, model=model_id1)


	# functions

	def infer(text):
	    """Synthesize *text* with the ``inference`` pipeline and return a WAV path.

	    The pipeline result's ``"output_wav"`` entry is written as binary data to
	    a newly created file.

	    Args:
	        text: Text to synthesize.

	    Returns:
	        Path (str) of the written file; the file name ends in ``myfile.wav``.
	    """
	    import tempfile  # local import: only this function needs it

	    output = inference(input=text)

	    # NamedTemporaryFile always creates a fresh, uniquely named file, so two
	    # requests can never collide the way a random integer name opened with
	    # exclusive-create could.  delete=False keeps the file on disk after the
	    # handle is closed so the caller can still read it.
	    with tempfile.NamedTemporaryFile(mode='wb', suffix="myfile.wav", delete=False) as f:
	        f.write(output["output_wav"])
	    return f.name

	def infer1(text):
	    """Synthesize *text* with the ``inference1`` pipeline and return a WAV path.

	    The pipeline result's ``"output_wav"`` entry is written as binary data to
	    a newly created file.

	    Args:
	        text: Text to synthesize.

	    Returns:
	        Path (str) of the written file; the file name ends in ``file.wav``.
	    """
	    import tempfile  # local import: only this function needs it

	    output = inference1(input=text)

	    # NamedTemporaryFile always creates a fresh, uniquely named file, so two
	    # requests can never collide the way a random integer name opened with
	    # exclusive-create could.  delete=False keeps the file on disk after the
	    # handle is closed so the caller can still read it.
	    with tempfile.NamedTemporaryFile(mode='wb', suffix="file.wav", delete=False) as f:
	        f.write(output["output_wav"])
	    return f.name

	# upsample

	import numpy as np
	import torch
	from hifi_gan_bwe import BandwidthExtender
	from scipy.io.wavfile import write

	# Input cap for extend(): it is multiplied by the sample rate there to bound
	# the number of samples kept, so it is effectively a duration in seconds.
	MAX_LENGTH = 600.0

	# Pretrained bandwidth-extension model, loaded once at import time and
	# called by extend().
	model = BandwidthExtender.from_pretrained("hifi-gan-bwe-10-42890e3-vctk-48kHz")

	def extend(audio):
	    """Run the bandwidth-extension ``model`` on *audio* and return a WAV path.

	    Args:
	        audio: ``(sample_rate, samples)`` pair, or ``None`` when no audio has
	            been produced yet.  ``samples`` is a NumPy array, either 1-D or
	            2-D with one column per channel.  The samples are scaled by
	            1/32767, i.e. 16-bit PCM is assumed -- TODO confirm for other
	            dtypes.

	    Returns:
	        Path (str) of a newly created WAV file written at
	        ``model.sample_rate``.

	    Raises:
	        gr.Error: If *audio* is ``None``.
	    """
	    import tempfile  # local import: only this function needs it

	    if audio is None:
	        # Give the user a readable message instead of a tuple-unpacking error.
	        raise gr.Error("请先生成音频，再提高采样率。")

	    fs, samples = audio
	    samples = samples[:int(MAX_LENGTH * fs)]  # keep at most MAX_LENGTH seconds
	    samples = samples.astype(np.float32) / 32767.0
	    if samples.ndim == 1:
	        samples = samples[:, np.newaxis]  # mono -> a single channel column

	    with torch.no_grad():
	        # The model is applied to one channel at a time; the results are
	        # stacked and transposed back to one column per channel.
	        y = np.stack(
	            [model(torch.from_numpy(channel), fs) for channel in samples.T]
	        ).T

	    # Clip before converting: values outside [-1, 1] would otherwise overflow
	    # the int16 range in the cast.
	    y = (np.clip(y, -1.0, 1.0) * 32767.0).astype(np.int16)
	    fs = int(model.sample_rate)

	    # A unique file per call, so simultaneous requests cannot overwrite each
	    # other's result the way a fixed file name would.
	    with tempfile.NamedTemporaryFile(suffix="upsample.wav", delete=False) as tmp:
	        out_path = tmp.name
	    write(out_path, fs, y)

	    return out_path

	# denoise

	def inference_denoise(audio):
	    """Restore the WAV file at *audio* with ``voicefixer`` and return the result path.

	    Args:
	        audio: Path of the input WAV file, or ``None``/empty when no file is
	            available yet.

	    Returns:
	        Path (str) of a newly created WAV file holding the restored audio.

	    Raises:
	        gr.Error: If *audio* is ``None`` or empty.
	    """
	    import tempfile  # local import: only this function needs it

	    if not audio:
	        # Give the user a readable message instead of failing inside restore().
	        raise gr.Error("请先提高采样率，再进行降噪。")

	    # A unique file per call, so simultaneous requests cannot overwrite each
	    # other's result the way a fixed file name would.
	    with tempfile.NamedTemporaryFile(suffix="output.wav", delete=False) as tmp:
	        restored_path = tmp.name

	    voicefixer.restore(
	        input=audio,            # input wav file path
	        output=restored_path,   # output wav file path
	        cuda=False,             # whether to use gpu acceleration
	        mode=0,                 # You can try out mode 0, 1 to find out the best result
	    )
	    return restored_path


	# Gradio Blocks container that holds the whole page.
	app = gr.Blocks()

	with app:
	# Page header: three Markdown headings (title, tagline, link).
	gr.Markdown("# <center>🥳🎶🎡 - 中文声音克隆</center>")
	gr.Markdown("## <center>🌟 - 训练3分钟，推理5秒钟，中英文自然发音、真实拟声 </center>")
	gr.Markdown("### <center>🌊 - 更多精彩应用，敬请关注(http://www.motionface.cn)💕</center>")

	with gr.Row():
	with gr.Column():
	# Text to synthesize; passed to infer() / infer1().
	inp = gr.Textbox(lines=5, label="请填写您想要转换的中文文本")
	with gr.Row():
	# One button per voice: btn -> infer(), btn1 -> infer1().
	btn = gr.Button("使用AI娜娜的声音", variant="primary")
	btn1 = gr.Button("使用AI小杰的声音", variant="primary")
	with gr.Column():
	with gr.Row():
	# out: synthesized audio.  No type= is given here; extend() unpacks the
	# value it receives from this component as (sample_rate, samples).
	out = gr.Audio(label="为您生成的专属音频", interactive=False)
	# out1: result of extend(); passed on to inference_denoise() as a file path.
	out1 = gr.Audio(label="更高采样率的专属音频", type="filepath", interactive=False)
	# out2: result of inference_denoise().
	out2 = gr.Audio(label="降噪后的高采样率音频", type="filepath", interactive=False)
	with gr.Row():
	# Post-processing buttons: btn2 -> extend(), btn3 -> inference_denoise().
	btn2 = gr.Button("一键提高采样率")
	btn3 = gr.Button("一键降噪")

	# Event wiring.  Both voice buttons write to the same `out` component; the
	# post-processing steps are chained out -> out1 -> out2.
	btn.click(fn=infer, inputs=[inp], outputs=[out])
	btn1.click(fn=infer1, inputs=[inp], outputs=[out])
	btn2.click(fn=extend, inputs=[out], outputs=[out1])
	btn3.click(fn=inference_denoise, inputs=[out1], outputs=[out2])

	# Usage notice and footer.
	gr.Markdown("### <center>注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。</center>")
	gr.HTML('''
	<div class="footer">
	<p>🌊🏞️🎶 - motionface.cn
	</p>
	</div>
	''')
	# Start the app; show_error=True asks Gradio to surface handler errors in the UI.
	app.launch(show_error=True)