File size: 4,075 Bytes
990357a
fe48c10
4d74158
fe48c10
 
239000b
fe48c10
990357a
 
 
 
6d066e2
 
990357a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d066e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9e95cb
 
4d74158
 
 
8f9de4a
4d74158
f9e95cb
6d066e2
 
 
 
 
 
 
 
acda046
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7fcb0ce
 
f9e95cb
 
 
 
4a4f8b1
f9e95cb
 
 
 
 
6d066e2
7f29d20
 
acda046
 
 
 
 
 
f9e95cb
9243345
acda046
6d066e2
7fcb0ce
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import gradio as gr
import random

os.system("pip install kantts -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")
os.system("pip install numpy==1.22.0")

from modelscope.models.audio.tts import SambertHifigan
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# model_0

model_dir = os.path.abspath("./pretrain_work_dir")

custom_infer_abs = {
    'voice_name':
    'F7',
    'am_ckpt':
    os.path.join(model_dir, 'tmp_am', 'ckpt'),
    'am_config':
    os.path.join(model_dir, 'tmp_am', 'config.yaml'),
    'voc_ckpt':
    os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
    'voc_config':
    os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan',
             'config.yaml'),
    'audio_config':
    os.path.join(model_dir, 'data', 'audio_config.yaml'),
    'se_file':
    os.path.join(model_dir, 'data', 'se', 'se.npy')
}
kwargs = {'custom_ckpt': custom_infer_abs}

model_id = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)

inference = pipeline(task=Tasks.text_to_speech, model=model_id)

# model_1

model_dir1 = os.path.abspath("./jay/pretrain_work_dir")

custom_infer_abs1 = {
    'voice_name':
    'F7',
    'am_ckpt':
    os.path.join(model_dir1, 'tmp_am', 'ckpt'),
    'am_config':
    os.path.join(model_dir1, 'tmp_am', 'config.yaml'),
    'voc_ckpt':
    os.path.join(model_dir1, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
    'voc_config':
    os.path.join(model_dir1, 'orig_model', 'basemodel_16k', 'hifigan',
             'config.yaml'),
    'audio_config':
    os.path.join(model_dir1, 'data', 'audio_config.yaml'),
    'se_file':
    os.path.join(model_dir1, 'data', 'se', 'se.npy')
}
kwargs1 = {'custom_ckpt': custom_infer_abs1}

model_id1 = SambertHifigan(os.path.join(model_dir1, "orig_model"), **kwargs1)

inference1 = pipeline(task=Tasks.text_to_speech, model=model_id1)


# functions

def infer(text):
    output = inference(input=text)
    filename = str(random.randint(1, 1000000000000))
    
    with open(filename + "myfile.wav", mode='bx') as f:
        f.write(output["output_wav"])
    return filename + "myfile.wav"

def infer1(text):
    output = inference1(input=text)
    filename = str(random.randint(1, 1000000000000))
    
    with open(filename + "file.wav", mode='bx') as f:
        f.write(output["output_wav"])
    return filename + "file.wav"

# upsample

import numpy as np
import torch
from hifi_gan_bwe import BandwidthExtender

MAX_LENGTH = 600.0

model = BandwidthExtender.from_pretrained("hifi-gan-bwe-10-42890e3-vctk-48kHz")

def extend(audio):
    fs, x = audio
    x = x[:int(MAX_LENGTH * fs)]
    x = x.astype(np.float32) / 32767.0
    if len(x.shape) == 1:
        x = x[:, np.newaxis]

    with torch.no_grad():
        y = np.stack([model(torch.from_numpy(x), fs) for x in x.T]).T
        y = (y * 32767.0).astype(np.int16)
        fs = int(model.sample_rate)

    return fs, y


app = gr.Blocks()

with app:
    gr.HTML("<center>"
            "<h1>🥳🎶🎡 - KanTTS中文声音克隆</h1>"
            "</center>")
    gr.Markdown("## <center>🌊 - 更多精彩应用,敬请关注[滔滔AI](http://www.talktalkai.com);滔滔AI,为爱滔滔!💕</center>")

   
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(lines=5, label="请填写您想要转换的中文文本")
            with gr.Row():
                btn = gr.Button("使用AI娜娜的声音", variant="primary")
                btn1 = gr.Button("使用AI小杰的声音", variant="primary")
        with gr.Column():
            with gr.Row():
                out = gr.Audio(label="为您生成的专属音频", type="filepath")
                out1 = gr.Audio(label="更高采样率的专属音频", type="filepath")
            btn2 = gr.Button("一键提高采样率")
    
        btn.click(fn=infer, inputs=[inp], outputs=[out])
        btn1.click(fn=infer1, inputs=[inp], outputs=[out])
        btn2.click(fn=extend, inputs=[out], outputs=[out1])
        
app.launch(show_error=True)