import os
import gradio as gr
import random

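# Install pinned audio/TTS dependencies at startup: Cython is upgraded before pysptk
# is built without build isolation, kantts comes from the ModelScope release index,
# and librosa/numpy are pinned to compatible versions.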
os.system("pip install --upgrade Cython==0.29.35")
os.system("pip install pysptk --no-build-isolation")
os.system("pip install kantts -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")
os.system("pip install librosa==0.9.2")
os.system("pip install numpy==1.22.4")

from modelscope.models.audio.tts import SambertHifigan
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

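# VoiceFixer powers the one-click denoising step in the UI below.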
from voicefixer import VoiceFixer
voicefixer = VoiceFixer()

# model_0: custom SambertHifigan voice "AI Nana" (first button in the UI)

model_dir = os.path.abspath("./pretrain_work_dir")

custom_infer_abs = {
    'voice_name': 'F7',
    'am_ckpt': os.path.join(model_dir, 'tmp_am', 'ckpt'),
    'am_config': os.path.join(model_dir, 'tmp_am', 'config.yaml'),
    'voc_ckpt': os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
    'voc_config': os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'config.yaml'),
    'audio_config': os.path.join(model_dir, 'data', 'audio_config.yaml'),
    'se_file': os.path.join(model_dir, 'data', 'se', 'se.npy'),
}
kwargs = {'custom_ckpt': custom_infer_abs}

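# Build the fine-tuned SambertHifigan model from the custom checkpoints and wrap it
# in a ModelScope text-to-speech pipeline.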
model_id = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)

inference = pipeline(task=Tasks.text_to_speech, model=model_id)

# model_1: custom SambertHifigan voice "AI Xiaojie" (second button in the UI)

model_dir1 = os.path.abspath("./jay/pretrain_work_dir")

custom_infer_abs1 = {
    'voice_name': 'F7',
    'am_ckpt': os.path.join(model_dir1, 'tmp_am', 'ckpt'),
    'am_config': os.path.join(model_dir1, 'tmp_am', 'config.yaml'),
    'voc_ckpt': os.path.join(model_dir1, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
    'voc_config': os.path.join(model_dir1, 'orig_model', 'basemodel_16k', 'hifigan', 'config.yaml'),
    'audio_config': os.path.join(model_dir1, 'data', 'audio_config.yaml'),
    'se_file': os.path.join(model_dir1, 'data', 'se', 'se.npy'),
}
kwargs1 = {'custom_ckpt': custom_infer_abs1}

model_id1 = SambertHifigan(os.path.join(model_dir1, "orig_model"), **kwargs1)

inference1 = pipeline(task=Tasks.text_to_speech, model=model_id1)


# functions

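# Each infer function runs one of the two TTS pipelines and writes the returned wav
# bytes to a randomly named file so Gradio can serve it.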
def infer(text):
    output = inference(input=text)
    filename = str(random.randint(1, 1000000000000))
    
    with open(filename + "myfile.wav", mode='bx') as f:
        f.write(output["output_wav"])
    return filename + "myfile.wav"

def infer1(text):
    output = inference1(input=text)
    filename = str(random.randint(1, 1000000000000))
    
    with open(filename + "file.wav", mode='bx') as f:
        f.write(output["output_wav"])
    return filename + "file.wav"

# upsample

import numpy as np
import torch
from hifi_gan_bwe import BandwidthExtender
from scipy.io.wavfile import write

MAX_LENGTH = 600.0  # maximum input duration to upsample, in seconds

model = BandwidthExtender.from_pretrained("hifi-gan-bwe-10-42890e3-vctk-48kHz")

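# Gradio passes the audio as a (sample_rate, int16 samples) tuple; convert it to
# float32, run each channel through the bandwidth extender, and write the result at
# the extender's sample rate (48 kHz for this checkpoint).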
def extend(audio):
    fs, x = audio
    x = x[:int(MAX_LENGTH * fs)]            # truncate to MAX_LENGTH seconds
    x = x.astype(np.float32) / 32767.0      # int16 -> float32 in [-1, 1]
    if len(x.shape) == 1:
        x = x[:, np.newaxis]                # ensure shape (samples, channels)

    with torch.no_grad():
        # process each channel independently, then convert back to int16
        y = np.stack([model(torch.from_numpy(channel), fs) for channel in x.T]).T
        y = (y * 32767.0).astype(np.int16)
        fs = int(model.sample_rate)
        write("upsample.wav", fs, y)

    return "upsample.wav"

# denoise

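# Run VoiceFixer on the upsampled wav file and return the path of the cleaned-up audio.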
def inference_denoise(audio):
    voicefixer.restore(input=audio,          # input wav file path
                       output="output.wav",  # output wav file path
                       cuda=False,           # whether to use GPU acceleration
                       mode=0)               # try modes 0 and 1 to find the best result
    return 'output.wav'


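# Gradio UI: a text box and two voice buttons, with one-click upsampling and denoising
# chained onto the generated audio.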
app = gr.Blocks()

with app:
    gr.Markdown("# <center>🥳🎶🎡 - 中文声音克隆</center>")
    gr.Markdown("## <center>🌟 - 训练3分钟,推理5秒钟,中英文自然发音、真实拟声 </center>")
    gr.Markdown("### <center>🌊 - 更多精彩应用,敬请关注(http://www.motionface.cn)💕</center>")

    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(lines=5, label="Enter the Chinese text you want to convert")
            with gr.Row():
                btn = gr.Button("Use AI Nana's voice", variant="primary")
                btn1 = gr.Button("Use AI Xiaojie's voice", variant="primary")
        with gr.Column():
            with gr.Row():
                out = gr.Audio(label="Audio generated for you", interactive=False)
                out1 = gr.Audio(label="Higher-sample-rate audio", type="filepath", interactive=False)
                out2 = gr.Audio(label="Denoised high-sample-rate audio", type="filepath", interactive=False)
            with gr.Row():
                btn2 = gr.Button("Upsample with one click")
                btn3 = gr.Button("Denoise with one click")
    
    btn.click(fn=infer, inputs=[inp], outputs=[out])
    btn1.click(fn=infer1, inputs=[inp], outputs=[out])
    btn2.click(fn=extend, inputs=[out], outputs=[out1])
    btn3.click(fn=inference_denoise, inputs=[out1], outputs=[out2])

    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
    gr.HTML('''
        <div class="footer">
                    <p>🌊🏞️🎶 - motionface.cn
                    </p>
        </div>
    ''')
app.launch(show_error=True)