File size: 15,373 Bytes
e569c5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
#!/usr/bin/env python3
# -*- coding=utf8 -*-
########################################################################
#
# Copyright (c) 2023 Baidu.com, Inc. All Rights Reserved
#
########################################################################

"""
Author: linxiaolong
"""
import warnings
# Kept ahead of the remaining imports so the filter is already active while they load.
warnings.filterwarnings("ignore")

# Standard library
import argparse
import json
import os
import re
import tempfile

# Third-party packages
import gradio as gr
import gradio.processing_utils as gr_processing_utils
import gradio.utils as gr_utils
import librosa
import numpy as np
import requests
import torch
from torch import LongTensor, no_grad

# Project-local modules
import commons
import utils
from mel_processing import spectrogram_torch
from models import SynthesizerTrn
from text import text_to_sequence, text_to_sequence_for_test, _clean_text
from text.symbols import symbols

# True when the SYSTEM environment variable equals "spaces".
# NOTE(review): nothing visible in this file reads `limitation` - confirm whether the limit is enforced elsewhere.
limitation = os.getenv("SYSTEM") == "spaces"  # limit text and audio length in huggingface spaces
# Sentence-ending punctuation; tts_fn passes this pattern to split_text.
# NOTE(review): `!` and `?` each appear twice in the character class - if the second pair was
# meant to be the full-width forms, verify the file's encoding.
punct_regex = re.compile(r"[\.!\?。!?]")
# Pause inserted between synthesized sentences, in milliseconds (tts_fn passes it to concat_audio).
silence_duration = 200


def split_text(text, regex):
    """Split text into sentences at punctuation marks.

    Each returned sentence keeps the punctuation mark that terminated it.
    A trailing fragment that has no terminator gets a "。" appended.
    Fragments that are empty or consist only of whitespace are dropped, so
    input such as "a. " does not yield a blank sentence.

    Args:
        text: long text.
        regex: punctuation pattern (string or compiled). It must not contain
            capturing groups, otherwise ``re.split`` and ``re.findall``
            no longer line up.

    Returns:
        list of sentences.
    """
    fragments = re.split(regex, text)
    puncts = re.findall(regex, text)

    sentences = []
    for i, fragment in enumerate(fragments):
        # Consecutive marks, leading marks and stray whitespace produce
        # fragments with nothing to speak; skip them.
        if not fragment.strip():
            continue
        # fragments[i] is followed by puncts[i]; only the last fragment can
        # lack a terminator.
        terminator = puncts[i] if i < len(puncts) else "。"
        sentences.append(fragment + terminator)
    return sentences


def concat_audio(audio_list, sampling_rate=22050, silence_duration=1000):
    """Concatenate audio clips and insert silence between them.

    Args:
        audio_list: list of 1-D audio arrays.
        sampling_rate: audio sampling rate. Defaults to 22050.
        silence_duration: silence duration in milliseconds. Defaults to 1000.

    Returns:
        concatenated audio. A single-element list returns that element
        unchanged (same object); an empty list returns an empty float32
        array instead of raising IndexError.
    """
    audio_num = len(audio_list)
    if audio_num == 0:
        return np.zeros(0, dtype=np.float32)
    if audio_num == 1:
        return audio_list[0]

    silence_samples = int(sampling_rate * silence_duration / 1000)
    # float16 zeros: NumPy promotes them to the clips' dtype when concatenating
    # with float32/float64 audio, so the silence never widens the result.
    silence = np.zeros(silence_samples, dtype=np.float16)

    # Interleave clip, silence, clip, ... and concatenate once; concatenating
    # inside the loop would re-copy the growing buffer on every iteration.
    pieces = [audio_list[0]]
    for clip in audio_list[1:]:
        pieces.append(silence)
        pieces.append(clip)
    return np.concatenate(pieces, axis=0)


### Settings for the external TTS services
# Endpoint and request headers read by microsoft().
microsoft_url = "https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1"
# NOTE(review): the subscription key is committed in source - consider loading it from the
# environment and rotating the key.
microsoft_headers = {'Content-Type': 'application/json; charset=utf-8', 
'Ocp-Apim-Subscription-Key':'1f1ef0ce53b84261be94fab81df7e628'}
# Voice names offered in the Microsoft tab's dropdown.
microsoft_model_list = [
    "ja-JP-NanamiNeural", 
    "ja-JP-KeitaNeural", 
    "ja-JP-AoiNeural",
    "ja-JP-DaichiNeural",
    "ja-JP-MayuNeural",
    "ja-JP-NaokiNeural",
    "ja-JP-ShioriNeural"
]

# Endpoint and request headers read by google().
# NOTE(review): the apikey is committed in source - consider loading it from the environment.
google_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/voice_gq"
google_headers = {'Content-Type': 'application/json; charset=utf-8', 
'apikey':'synclub-2383kjhjksxfv.2341gs'}
# Voice names offered in the Google tab's dropdown.
google_model_list = [
    "ja-JP-Neural2-B",
    "ja-JP-Neural2-C",
    "ja-JP-Neural2-D",
    "ja-JP-Standard-A",
    "ja-JP-Standard-B",
    "ja-JP-Standard-C",
    "ja-JP-Standard-D",
    "ja-JP-Wavenet-A",
    "ja-JP-Wavenet-B",
    "ja-JP-Wavenet-C",
    "ja-JP-Wavenet-D"
]

# Endpoint and request headers read by coefont().
# NOTE(review): the apikey is committed in source - consider loading it from the environment.
coefont_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/avatar_coe"
coefont_headers = {'Content-Type': 'application/json; charset=utf-8', 
'apikey':'synclub-2383kjhjksxfv.2341gs'}
# Voice ids; paired by position with coefont_model_list below, so both lists
# must keep the same length and order.
coefont_id = [
    '3f84b7b1-30fb-4677-a704-fd136515303e',
    '9b826785-bea5-4740-b4cd-e9a286264705',
    '7632cba3-4aca-4cee-9d15-ad1ac31f670c',
    '2c91238a-96f9-4cb6-a69a-461ee66b0e6d',
    '08428dee-65b6-490e-a3a3-60dfcdda889d',
    'c88367bc-5954-426b-a1ba-a683202803c8',
    'fb64a764-91d5-4510-bddd-70df3d62709a',
    '5cfa1f33-bca8-4489-bcbe-701045993162',
    '94cf7792-7c0c-4be4-88e7-c30d26ab6616',
    '81dbd387-6ad6-4b22-93f9-4e2a0091b2fe',
    '931a8568-039a-4cef-add7-bee71629c00e',
    'f91a9d29-c8b4-443f-ba07-82e7e36bd20b',
    '23c76cf0-bee0-47fa-b735-9b7bdba9f26a',
    'cf5fdfb8-85ea-41e1-915b-257936791f17',
    '0f7b53df-3c24-46a5-84d1-cbea39a956c0',
    '3d499385-d331-4cbb-93c0-2057e60eddcf',
    '18ca2f7b-97ca-486d-8f47-858965833642',
    '33e0a2ff-5050-434c-9506-defe97e52f15',
    '516b0f32-8b5f-48c5-b60e-38d508e2b06b',
    'c8720caf-2d2d-4130-8831-92f61f9e25e8',
    '710001f5-e6f5-4cc0-8ba2-e6aa6da8d807',
    'd36f8bb1-8bd8-4e90-964a-9dbd3e374093',
    '2157796c-fe48-4688-b7cc-7ea554edf77d',
    '5cc0dc91-0c6a-4c50-b7d8-f3117cfe44ef',
    'be5c5295-aba2-4055-a9da-8926da7fb5a0',
    '76763239-af14-4c0d-9435-956f096f77dc',
    '10d298ee-ebbf-4838-a6c5-d608f2e3c338',
    '694cb06e-73bd-43c4-94d4-f775ad3dbb26',
    '5cf07e7c-5b1c-4360-a8de-7c928580d4b5',
    '76e2ba06-b23a-4bbe-8148-e30ede9001b9',
    'c25ed97f-78f7-4e8f-b2fa-f8e29633588b',
    'e26382ba-2ae2-4cf7-8c1b-420ab4b845d8',
    '82c4fcf5-d0ee-4fe9-9b0d-89a65d04f290'
 ]
# Display names offered in the coefont tab's dropdown (same order as coefont_id).
coefont_model_list = [
    'Canel',
    '胡麻ちゃん',
    'バーチャル悪霊',
    '引寄\u3000法則',
    'にっし~☆',
    '志水 智(Tomo Shimizu)',
    '花撫シア-最高精度-しっかり読み上げ',
    'UNF/UserNotFound',
    'RoBaKu',
    'おにもち',
    '小菅 将太',
    '秋月つむぎ(落ち着いたナレーション)',
    '碧海紘斗_OhmiHiroto',
    'ちくわぶえ',
    'unnamed',
    '今井瑶子(高精度。MC ナレーター 落ち着いたトーンです)',
    '皆のお母さん',
    '後藤邑子',
    '田中和彦',
    'KTNR',
    '天渡\u3000早苗',
    '須戸ゼロ',
    'とり藻々',
    '武田 祐子',
    '【PRO】落ち着きナレーション♯畑耕平',
    '音暖ののん Ver2.0(最高精度)',
    'ろさちゃん-soft-v2[最高精度] ¦ Losa-chan -soft- ∀ -汎用式概念χ',
    'パイナップル秀夫お姉さん',
    'minamo',
    'あさのゆき',
    '聲華 琴音【紡】',
    '黄琴海月【うるとら】',
    '高橋 俊輔']
# Display name -> voice id; coefont() looks the selected dropdown value up here.
coefont_id_model_name_dict = dict(zip(coefont_model_list, coefont_id))

# Default text pre-filled in the text box of each external-provider tab.
all_example = "今日は天気がいいから、一緒にハイキングに行きましょう。"

# def audio_postprocess(self, y):
#     """
#     Override gradio's audio post-processing function.
#     :param self:
#     :param y:
#     :return:
#     """
#     if y is None:
#         return None

#     if gr_utils.validate_url(y):
#         file = gr_processing_utils.download_to_file(y, dir=self.temp_dir)
#     elif isinstance(y, tuple):
#         sample_rate, data = y
#         file = tempfile.NamedTemporaryFile(
#             suffix=".wav", dir=self.temp_dir, delete=False
#         )
#         gr_processing_utils.audio_to_file(sample_rate, data, file.name)
#     else:
#         file = gr_processing_utils.create_tmp_copy_of_file(y, dir=self.temp_dir)

#     return gr_processing_utils.encode_url_or_file_to_base64(file.name)

# gr.Audio.postprocess = audio_postprocess

def get_text(text, hps):
    """Turn a sentence into the LongTensor of symbol ids used as model input.

    :param text: sentence to convert.
    :param hps: hyper-parameters; ``hps.data.text_cleaners`` and
        ``hps.data.add_blank`` are read here.
    :return: ``LongTensor`` holding the id sequence.
    """
    # This variant is for configs whose hps carries no symbol table of its own.
    # For configs that do include one, the alternative is
    # text_to_sequence_for_test(text, hps.symbols, hps.data.text_cleaners).
    token_ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Hand the ids to commons.intersperse together with the value 0.
        token_ids = commons.intersperse(token_ids, 0)
    return LongTensor(token_ids)


def create_tts_fn(model, hps):
    """Build the Gradio callback that synthesizes speech with one loaded model.

    :param model: synthesizer whose ``infer`` method is called per sentence.
    :param hps: hyper-parameters of that model; passed to ``get_text`` and
        read for ``hps.data.sampling_rate``.
    :return: the ``tts_fn`` closure.
    """
    def tts_fn(text, speed, noise_scale=.667, noise_scale_w=0.8, volume=1.0):
        """Synthesize ``text`` sentence by sentence and join the results.

        :param text: input text; split into sentences with ``punct_regex``.
        :param speed: speaking-rate factor; ``1.0 / speed`` is passed to
            ``model.infer`` as ``length_scale``, so it must be non-zero.
        :param noise_scale: forwarded to ``model.infer``.
        :param noise_scale_w: forwarded to ``model.infer``.
        :param volume: factor the final waveform is multiplied by.
        :return: ``(message, audio)``. On success ``audio`` is
            ``(sampling_rate, waveform)``; when the text contains nothing to
            synthesize it is ``None``.
        """
        sentences = split_text(text, punct_regex)
        if not sentences:
            # Empty or punctuation-only input: there is nothing to feed the
            # model, and concatenating an empty clip list would fail.
            return "Error: no text to synthesize", None

        audio_list = []
        for sentence in sentences:
            stn_tst = get_text(sentence, hps)
            with no_grad():
                # `device` is the module-level name assigned in the __main__ block.
                x_tst = stn_tst.unsqueeze(0).to(device)
                x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
                audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                                    length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
                audio_list.append(audio)
            # Drop the per-sentence tensors before the next iteration allocates new ones.
            del stn_tst, x_tst, x_tst_lengths
        audio = concat_audio(audio_list, hps.data.sampling_rate, silence_duration)
        audio = audio * volume
        return "Success", (hps.data.sampling_rate, audio)
    return tts_fn


def microsoft(text, name, style="Neural"):
    """Synthesize ``text`` with a Microsoft voice and return the audio file.

    The text is wrapped in SSML and POSTed to ``microsoft_url``; the response
    body (MP3, as requested through ``X-Microsoft-OutputFormat``) is written
    to a temporary ``.mp3`` file whose path is returned for the Audio output.

    :param text: text to synthesize.
    :param name: voice name, e.g. one of ``microsoft_model_list``.
    :param style: accepted for interface compatibility; not used.
    :return: ``("Success", path_to_mp3)``.
    :raises requests.HTTPError: when the service answers with an error status.
    """
    from xml.sax.saxutils import escape, quoteattr

    headers = {
        # Reuse the module-level key instead of repeating the literal here.
        'Ocp-Apim-Subscription-Key': microsoft_headers['Ocp-Apim-Subscription-Key'],
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
        'User-Agent': 'curl',
    }

    # Escape user input so characters such as "&" or "<" cannot break the
    # SSML document or inject markup.
    # NOTE(review): xml:lang is 'en-US' although the offered voices are ja-JP - confirm.
    ssml = ("<speak version='1.0' xml:lang='en-US'>"
            f"<voice xml:lang='en-US' name={quoteattr(str(name))}>"
            f"{escape(str(text))}"
            "</voice>"
            "</speak>")

    response = requests.post(
        microsoft_url,
        headers=headers,
        # Encode explicitly so the body is UTF-8 regardless of how the HTTP
        # stack would encode a str body by default.
        data=ssml.encode("utf-8"),
        proxies={
            'http': 'http://192.168.3.11:80',
            'https': 'http://192.168.3.11:80',
        },
        # Without a timeout a stalled connection blocks a queue worker forever.
        timeout=60,
    )
    response.raise_for_status()

    # delete=False: the file has to outlive this function so the UI can read
    # it; nothing here removes it afterwards.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        audio_file.write(response.content)
    return "Success", audio_file.name
    

def google(text, name):
    """Request speech for ``text`` from the service behind ``google_url``.

    :param text: text to synthesize.
    :param name: voice name, e.g. one of ``google_model_list``.
    :return: ``("Success", audio_url)`` where ``audio_url`` is the
        ``data.url`` field of the JSON response.
    :raises requests.HTTPError: when the service answers with an error status.
    """
    payload = {
        "text": text,
        "name": name,
        "sample_rate": 16000,
    }
    # Without a timeout a stalled connection blocks a queue worker forever.
    response = requests.get(google_url, headers=google_headers, json=payload, timeout=60)
    # Report HTTP failures as such instead of a JSON-decoding or KeyError later on.
    response.raise_for_status()
    audio_url = response.json()['data']['url']
    return "Success", audio_url


def coefont(text, name):
    """Request speech for ``text`` from the service behind ``coefont_url``.

    :param text: text to synthesize.
    :param name: voice display name; must be a key of
        ``coefont_id_model_name_dict``.
    :return: ``("Success", audio_url)`` where ``audio_url`` is the
        ``data.url`` field of the JSON response.
    :raises KeyError: when ``name`` is not a known display name.
    :raises requests.HTTPError: when the service answers with an error status.
    """
    payload = {
        "text": text,
        # The service is addressed by voice id, not by display name.
        "coefont": coefont_id_model_name_dict[name],
    }
    # Without a timeout a stalled connection blocks a queue worker forever.
    response = requests.get(coefont_url, headers=coefont_headers, json=payload, timeout=60)
    # Report HTTP failures as such instead of a JSON-decoding or KeyError later on.
    response.raise_for_status()
    audio_url = response.json()['data']['url']
    return "Success", audio_url


if __name__ == '__main__':
    # Entry point: parse options, load the models listed in the info file,
    # build the tabbed Gradio UI and serve it.
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    parser.add_argument("--port", type=int, default=8080, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()

    # Module-level on purpose: the closures returned by create_tts_fn read `device`.
    device = torch.device(args.device)

    # model_type from the info file -> emotion_type keyword for SynthesizerTrn.
    emotion_type_by_model_type = {
        "vits": None,
        "vits-emotion": "embedding",
        "vits-emotion-logits": "logits",
    }

    models_tts = []
    with open(args.model_info_path, "r", encoding="utf-8") as f:
        models_info = json.load(f)
    for info in models_info.values():
        model_name = info["model_name"]
        author = info["author"]
        lang = info["lang"]
        example = info["example"]
        config_path = info["config_path"]
        model_path = info["model_path"]
        model_type = info["model_type"]

        # Fail clearly on an unknown type instead of hitting an unbound
        # emotion_type or silently reusing the previous entry's value.
        if model_type not in emotion_type_by_model_type:
            raise ValueError(f"unsupported model_type {model_type!r} for model {model_name!r}")
        emotion_type = emotion_type_by_model_type[model_type]

        hps = utils.get_hparams_from_file(config_path)
        model = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            emotion_type=emotion_type,
            **hps.model)

        utils.load_checkpoint(model_path, model, None)
        model.eval().to(device)
        if model_type == "vits":
            # Plain TTS models are the only ones that get a tab in the UI.
            models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))

    # (tab label, dropdown choices, callback) for the external providers;
    # the labels read "Google", "Microsoft" and "coefont".
    external_tabs = (
        ("谷歌", google_model_list, google),
        ("微软", microsoft_model_list, microsoft),
        ("coefont", coefont_model_list, coefont),
    )

    app = gr.Blocks()
    with app:
        gr.Markdown("## Japanese TTS Demo")
        with gr.Tabs():
            # "自研" = in-house models, one sub-tab per loaded model.
            with gr.TabItem("自研"):
                with gr.Tabs():
                    for model_name, author, lang, example, tts_fn in models_tts:
                        with gr.TabItem(model_name):
                            with gr.Column():
                                tts_input1 = gr.TextArea(label="Text", value=example)
                                tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
                                tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1)
                                tts_input4 = gr.Slider(label="noise_scale_w", value=0.0,
                                                       minimum=0.0, maximum=2, step=0.1)
                                tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
                                tts_submit = gr.Button("Generate", variant="primary")
                                tts_output1 = gr.Textbox(label="Output Message")
                                tts_output2 = gr.Audio(label="Output Audio")
                                # click() binds the current tts_fn right away, so every tab keeps its own model.
                                tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
                                                 [tts_output1, tts_output2])

            # The provider tabs share one layout: text, voice dropdown, button, outputs.
            for tab_label, voice_names, synthesize in external_tabs:
                with gr.TabItem(tab_label):
                    tts_input1 = gr.TextArea(label="Text", value=all_example)
                    tts_input2 = gr.Dropdown(voice_names, label="name")
                    tts_submit = gr.Button("Generate", variant="primary")
                    tts_output1 = gr.Textbox(label="Output Message")
                    tts_output2 = gr.Audio(label="Output Audio")
                    tts_submit.click(synthesize, [tts_input1, tts_input2],
                                     [tts_output1, tts_output2])

    app.queue(concurrency_count=5).launch(show_api=False,
                                          share=args.share,
                                          server_name='0.0.0.0',
                                          server_port=args.port,
                                          show_error=True)