File size: 6,642 Bytes
2fe62fa
7e10b0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4592ec
7e10b0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4592ec
7e10b0f
 
 
 
 
 
 
 
 
 
13c3b46
 
7e10b0f
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# Run this file after cloning the repository on Colab and installing the environment.
import romajitable
import re
import numpy as np
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import IPython.display as ipd
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time
def get_text(text, hps):
    """Turn *text* into a ``torch.LongTensor`` of symbol ids.

    The text is converted by ``text_to_sequence`` using the cleaners listed in
    ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is truthy the
    resulting sequence is additionally passed through
    ``commons.intersperse(sequence, 0)`` before being wrapped in a tensor.
    """
    token_ids = text_to_sequence(text, symbols, hps.data.text_cleaners)
    token_ids = commons.intersperse(token_ids, 0) if hps.data.add_blank else token_ids
    return torch.LongTensor(token_ids)
# Device used for the model and every inference tensor. Prefer the first CUDA
# GPU, but fall back to the CPU so the script still starts on machines where
# CUDA is not available.
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Speaker display name -> integer speaker id passed to the model.
# NOTE(review): id 14 is not mapped to any name; the ids are copied verbatim
# from the original if/elif chain -- confirm against the model config.
_SPEAKER_IDS = {
    "高咲侑": 0,
    "歩夢": 1,
    "かすみ": 2,
    "しずく": 3,
    "果林": 4,
    "愛": 5,
    "彼方": 6,
    "せつ菜": 7,
    "エマ": 8,
    "璃奈": 9,
    "栞子": 10,
    "ランジュ": 11,
    "ミア": 12,
    "三色绘恋1": 13,
    "三色绘恋2": 15,
    "派蒙": 16,
}


def selection(speaker):
    """Return the integer speaker id for the display name *speaker*.

    Unknown names yield ``None``, as before. The previous if/elif chain
    assigned the id for "三色绘恋2" but never returned it, so that speaker
    also produced ``None``; the lookup table returns 15 for it.
    """
    return _SPEAKER_IDS.get(speaker)
def is_japanese(string):
    """Return True if any character of *string* is kana.

    A character counts when its code point lies strictly between U+3040 and
    U+30FF, i.e. inside the hiragana / katakana range (both bounds excluded).
    An empty string yields False.
    """
    return any(0x3040 < ord(ch) < 0x30FF for ch in string)
def is_english(string):
    """Return True when *string* is non-empty and made up only of ASCII
    letters, digits, spaces and the punctuation ``. , : ; ! ? ( ) _ * " '``.
    """
    return bool(re.fullmatch('^[A-Za-z0-9.,:;!?()_*"\' ]+$', string))
def sle(language,tts_input0):
    """Wrap *tts_input0* in the language tags expected by the text cleaners.

    * "自动": the text is wrapped in ``[JA]`` when ``is_japanese`` reports
      kana, otherwise in ``[ZH]``; the text itself is left untouched.
    * "中文" / "日文": newlines become '。' and spaces become ',' before the
      text is wrapped in ``[ZH]`` / ``[JA]`` respectively.
    * any other value: returns ``None``.
    """
    if language == "自动":
        tag = "[JA]" if is_japanese(tts_input0) else "[ZH]"
        return f"{tag}{tts_input0}{tag}"

    if language == "中文":
        tag = "[ZH]"
    elif language == "日文":
        tag = "[JA]"
    else:
        return None

    cleaned = tts_input0.replace('\n','。').replace(' ',',')
    return tag + cleaned + tag
def extrac(text, max_len=20):
    """Split raw input text into short sentences ready for synthesis.

    Processing steps:

    1. Every ``<...>`` span is removed from *text*.
    2. The remainder is split on newlines.
    3. A line accepted by ``is_english`` is converted to katakana with
       ``romajitable.to_kana``.
    4. Spaces are stripped; lines of at most one character are dropped.
    5. A line longer than *max_len* characters is split on '。' / '!'; every
       piece longer than one character is kept with '。' appended. Shorter
       lines are kept unchanged.

    The resulting list is printed and returned.

    Args:
        text: the raw text to split.
        max_len: longest line (in characters) kept as a single sentence.
            Defaults to 20, the value that was previously hard-coded.
    """
    text = re.sub("<[^>]*>","",text)
    final_list = []
    for line in re.split(r'\n', text):
        if is_english(line):
            line = romajitable.to_kana(line).katakana
        line = line.replace('\n','').replace(' ','')
        if len(line) <= 1:
            continue
        if len(line) <= max_len:
            final_list.append(line)
            continue
        # Long line: break it at sentence terminators. The former bare
        # ``except: pass`` around this loop is gone -- nothing here raises for
        # string input, and it would only have hidden real errors.
        for piece in re.split(r'。|!', line):
            if len(piece) > 1:
                final_list.append(piece+'。')
    print(final_list)
    return final_list
# Single-character rewrites applied to the raw text before sentence splitting:
# opening brackets become '<' and closing brackets become '>' (so the span they
# enclose is removed by ``extrac``), and curly double quotes are deleted.
# NOTE(review): the original lists repeated '(' and ')'; the duplicate entry
# was presumably the full-width form, so U+FF08 / U+FF09 are mapped explicitly.
_INFER_CHAR_TABLE = str.maketrans({
    '【': '<', '[': '<', '(': '<', '\uff08': '<',
    '】': '>', ']': '>', ')': '>', '\uff09': '>',
    '“': None, '”': None,
})


def infer(language,text,speaker_id, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
    """Synthesize *text* sentence by sentence and return the joined audio.

    Args:
        language: language choice forwarded to ``sle`` for every sentence.
        text: raw input text; bracketed spans are stripped and the rest is
            split into sentences by ``extrac``.
        speaker_id: speaker display name, resolved through ``selection``.
        n_scale: forwarded to the model as ``noise_scale``.
        n_scale_w: forwarded to the model as ``noise_scale_w``.
        l_scale: forwarded to the model as ``length_scale``.

    Returns:
        ``(hps_ms.data.sampling_rate, audio)`` where ``audio`` is the NumPy
        concatenation of the per-sentence model outputs.

    Raises:
        ValueError: if the speaker name is unknown, or if no sentence could be
            synthesized (empty text or every sentence failed).
    """
    spk = selection(speaker_id)
    if spk is None:
        # Fail with a readable message instead of ``int(None)``'s TypeError.
        raise ValueError(f"unknown speaker: {speaker_id!r}")
    speaker_id = int(spk)

    final_list = extrac(text.translate(_INFER_CHAR_TABLE))
    audio_fin = []
    for c, sentence in enumerate(final_list, start=1):
        try:
            stn_tst = get_text(sle(language,sentence), hps_ms)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([speaker_id]).to(dev)
                t1 = time.time()
                audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
                t2 = time.time()
            print(f"第{c}句的推理时间为:{t2-t1}s")
            audio_fin.append(audio)
        except Exception as err:
            # Best effort: skip the sentence that failed, but report the real
            # cause and let KeyboardInterrupt / SystemExit propagate.
            print('存在非法字符', repr(err))
    if not audio_fin:
        # np.concatenate([]) would raise an opaque ValueError; say why instead.
        raise ValueError("no sentence could be synthesized from the given text")
    return (hps_ms.data.sampling_rate, np.concatenate(audio_fin))
# Choices offered by the language dropdown; these are the values `sle` compares against.
lan = ["中文","日文","自动"]
# Speaker names offered by the speaker dropdown.
# NOTE(review): `selection` also maps "彼方", "三色绘恋1" and "三色绘恋2", which
# are not listed here -- confirm whether that omission is intentional.
idols = ["高咲侑","歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]
# Hyper-parameters read from the model's config file.
hps_ms = utils.get_hparams_from_file("lovelive/config.json")
# Build the synthesizer from the loaded hyper-parameters and move it to `dev`.
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model).to(dev)
# Switch to evaluation mode, then load the checkpoint into the model.
_ = net_g_ms.eval()
_ = utils.load_checkpoint("lovelive/G_525000.pth", net_g_ms)
# Gradio front end: a single tab with the text box, language / speaker
# dropdowns, three sliders and a button that runs `infer`.
app = gr.Blocks()
with app:
    with gr.Tabs():

        with gr.TabItem("虹团vits模型,现可按句分割实现长文本合成,可自行用export_to_onnx.py导出"):

            # Input text, pre-filled with a Chinese, a Japanese and an English sample line.
            tts_input1 = gr.TextArea(label="如需实现快速合成,建议在colab上克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了?我想做只属于你一个人的学院偶像,所以,请只注视我一个人,好吗?【中文】\nなんでそんなに慣れてんだよっ?せつ菜と…何回キスしたんだよ?どこまであたしを置いてきぼりにすれば気が済むんだよ?[日文]\nI can't choose just one(English)")
            language = gr.Dropdown(label="选择语言,目前勉强可以做到自动识别",choices=lan, value="自动", interactive=True)
            # Sliders passed to `infer` as n_scale, n_scale_w and l_scale (in that order).
            # Note: the n_scale_w slider starts at 0.7 while `infer`'s own default is 0.8.
            para_input1 = gr.Slider(minimum= 0,maximum=1.0,label="更改噪声比例,以控制情感", value=0.667)
            para_input2 = gr.Slider(minimum= 0,maximum=1.0,label="更改噪声偏差,以控制音素长短", value=0.7)
            para_input3 = gr.Slider(minimum= 0.1,maximum=10,label="更改时间比例", value=1)
            tts_submit = gr.Button("Generate", variant="primary")
            speaker1 = gr.Dropdown(label="选择说话人",choices=idols, value="歩夢", interactive=True)
            tts_output2 = gr.Audio(label="Output")
            # The input list follows `infer`'s positional parameter order; the
            # returned (sampling_rate, audio) tuple is sent to the audio output.
            tts_submit.click(infer, [language,tts_input1,speaker1,para_input1,para_input2,para_input3], [tts_output2])
    # NOTE(review): launch() is called while still inside the `with app:` context;
    # Gradio examples call it after the block -- confirm this works on the pinned version.
    app.launch(share=True)
    #app.launch()