"""Gradio demo: Chinese text-to-speech with two custom voices, plus clean-up.

The app offers four actions:

* synthesize speech from text with one of two fine-tuned SambertHifigan voices,
* upsample the synthesized audio with a pretrained HiFi-GAN bandwidth extender,
* denoise the upsampled audio with VoiceFixer.

Dependencies are installed at start-up (best effort) before the heavy
third-party packages are imported.
"""

import os
import random
import subprocess
import sys

import gradio as gr


def _pip_install(*args):
    """Run ``pip install`` with *args* for the running interpreter.

    Installation is best effort: a failure is reported on stderr but does not
    abort start-up, because the package may already be present.
    """
    # ``sys.executable -m pip`` guarantees the packages land in the environment
    # this script is running in, and the argument list avoids the shell.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", *args],
        check=False,
    )
    if result.returncode != 0:
        print(
            f"warning: pip install {' '.join(args)} "
            f"exited with status {result.returncode}",
            file=sys.stderr,
        )


# The installation order is kept exactly as the script has always run it.
_pip_install("--upgrade", "Cython==0.29.35")
_pip_install("pysptk", "--no-build-isolation")
_pip_install(
    "kantts",
    "-f",
    "https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html",
)
_pip_install("librosa==0.9.2")
_pip_install("numpy==1.22.0")

# These imports must come after the installs above.
import numpy as np  # noqa: E402
import torch  # noqa: E402
from hifi_gan_bwe import BandwidthExtender  # noqa: E402
from modelscope.models.audio.tts import SambertHifigan  # noqa: E402
from modelscope.pipelines import pipeline  # noqa: E402
from modelscope.utils.constant import Tasks  # noqa: E402
from scipy.io.wavfile import write  # noqa: E402
from voicefixer import VoiceFixer  # noqa: E402

voicefixer = VoiceFixer()


def _custom_ckpt(work_dir):
    """Return the ``custom_ckpt`` mapping for a fine-tuned voice in *work_dir*.

    The acoustic model comes from ``tmp_am``; the vocoder is the 16 kHz base
    HiFi-GAN shipped under ``orig_model``.
    """
    hifigan_dir = os.path.join(work_dir, 'orig_model', 'basemodel_16k', 'hifigan')
    return {
        'voice_name': 'F7',
        'am_ckpt': os.path.join(work_dir, 'tmp_am', 'ckpt'),
        'am_config': os.path.join(work_dir, 'tmp_am', 'config.yaml'),
        'voc_ckpt': os.path.join(hifigan_dir, 'ckpt'),
        'voc_config': os.path.join(hifigan_dir, 'config.yaml'),
        'audio_config': os.path.join(work_dir, 'data', 'audio_config.yaml'),
        'se_file': os.path.join(work_dir, 'data', 'se', 'se.npy'),
    }


# model_0
model_dir = os.path.abspath("./pretrain_work_dir")
custom_infer_abs = _custom_ckpt(model_dir)
kwargs = {'custom_ckpt': custom_infer_abs}
model_id = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)
inference = pipeline(task=Tasks.text_to_speech, model=model_id)

# model_1
model_dir1 = os.path.abspath("./jay/pretrain_work_dir")
custom_infer_abs1 = _custom_ckpt(model_dir1)
kwargs1 = {'custom_ckpt': custom_infer_abs1}
model_id1 = SambertHifigan(os.path.join(model_dir1, "orig_model"), **kwargs1)
inference1 = pipeline(task=Tasks.text_to_speech, model=model_id1)


# functions
def _synthesize_to_wav(tts_pipeline, text, suffix):
    """Synthesize *text* with *tts_pipeline* and save it to a new WAV file.

    The file is named ``<random integer><suffix>`` in the working directory
    and is opened in exclusive-create mode, so an existing file is never
    overwritten (``FileExistsError`` is raised instead).

    Returns the path of the written file.
    """
    output = tts_pipeline(input=text)
    path = f"{random.randint(1, 1000000000000)}{suffix}"
    with open(path, mode='xb') as wav_file:
        wav_file.write(output["output_wav"])
    return path


def infer(text):
    """Synthesize *text* with the first voice; return the WAV file path."""
    return _synthesize_to_wav(inference, text, "myfile.wav")


def infer1(text):
    """Synthesize *text* with the second voice; return the WAV file path."""
    return _synthesize_to_wav(inference1, text, "file.wav")


# upsample
MAX_LENGTH = 600.0  # longest input, in seconds, that will be upsampled

model = BandwidthExtender.from_pretrained("hifi-gan-bwe-10-42890e3-vctk-48kHz")


def extend(audio):
    """Upsample *audio* with the bandwidth extender and save it.

    Args:
        audio: ``(sample_rate, samples)`` pair as delivered by the audio
            component; samples are scaled as 16-bit PCM (assumes int16 input
            -- TODO confirm for other sources).

    Returns:
        Path of the written file, ``"upsample.wav"``.

    Raises:
        ValueError: if no audio has been generated yet.
    """
    if audio is None:
        raise ValueError("请先生成音频，再提高采样率。")

    fs, x = audio
    x = x[:int(MAX_LENGTH * fs)]
    x = x.astype(np.float32) / 32767.0
    if x.ndim == 1:
        # Mono input: add a channel axis so the per-channel loop below works.
        x = x[:, np.newaxis]

    with torch.no_grad():
        y = np.stack(
            [model(torch.from_numpy(channel), fs) for channel in x.T]
        ).T

    # Clip before the int16 cast; out-of-range samples would otherwise wrap.
    y = (np.clip(y, -1.0, 1.0) * 32767.0).astype(np.int16)
    fs = int(model.sample_rate)
    write("upsample.wav", fs, y)
    return "upsample.wav"


# denoise
def inference_denoise(audio):
    """Denoise the WAV file at path *audio* with VoiceFixer.

    Returns:
        Path of the restored file, ``'output.wav'``.

    Raises:
        ValueError: if no upsampled audio is available yet.
    """
    if audio is None:
        raise ValueError("请先提高采样率，再进行降噪。")

    voicefixer.restore(
        input=audio,  # input wav file path
        output="output.wav",  # output wav file path
        # Only request GPU acceleration when a CUDA device is actually
        # present, so CPU-only hosts still work.
        cuda=torch.cuda.is_available(),
        mode=0,  # You can try out mode 0, 1 to find out the best result
    )
    return 'output.wav'


app = gr.Blocks()

# NOTE(review): the layout nesting below was reconstructed from the statement
# order; confirm it matches the intended page structure.
with app:
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(lines=5, label="请填写您想要转换的中文文本")
            with gr.Row():
                btn = gr.Button("使用第一种声音", variant="primary")
                btn1 = gr.Button("使用第二种声音", variant="primary")
        with gr.Column():
            with gr.Row():
                out = gr.Audio(label="为您生成的专属音频")
                out1 = gr.Audio(label="更高采样率的专属音频", type="filepath")
                out2 = gr.Audio(label="降噪后的高采样率音频", type="filepath")
            with gr.Row():
                btn2 = gr.Button("一键提高采样率")
                btn3 = gr.Button("一键降噪")

    btn.click(fn=infer, inputs=[inp], outputs=[out])
    btn1.click(fn=infer1, inputs=[inp], outputs=[out])
    btn2.click(fn=extend, inputs=[out], outputs=[out1])
    btn3.click(fn=inference_denoise, inputs=[out1], outputs=[out2])

    gr.Markdown("### 注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。")
    gr.HTML(''' ''')

app.launch(show_error=True)