""" |
|
Author: linxiaolong |
|
""" |

import warnings

warnings.filterwarnings("ignore")

import argparse
import json
import os
import re

import numpy as np
import requests
import torch
from torch import LongTensor, no_grad

import commons
import gradio as gr
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

limitation = os.getenv("SYSTEM") == "spaces"  # True when running on Hugging Face Spaces
punct_regex = re.compile(r"[\.!\?。!?]")  # sentence-ending punctuation (ASCII and Japanese)
silence_duration = 200  # inter-sentence silence in milliseconds


def split_text(text, regex):
    """Split text into sentences at punctuation marks.

    Args:
        text: long text.
        regex: punctuation regex.

    Returns:
        list of sentences, each ending with a punctuation mark ("。" is
        appended when the final sentence has none).
    """
    sentences = re.split(regex, text)
    puncts = re.findall(regex, text)

    # re.split drops the delimiters, so re-attach each sentence's punctuation.
    for i, sentence in enumerate(sentences):
        if sentence == "":
            continue
        if i < len(puncts):
            sentences[i] = sentences[i] + puncts[i]
        else:
            sentences[i] = sentences[i] + "。"
    sentences = [i for i in sentences if i != ""]
    return sentences
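
# Example:
#   split_text("こんにちは。元気ですか", punct_regex)
#   -> ["こんにちは。", "元気ですか。"]  (a final "。" is appended when missing)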


def concat_audio(audio_list, sampling_rate=22050, silence_duration=1000):
    """Concatenate audio clips, inserting silence between them.

    Args:
        audio_list: list of audio arrays.
        sampling_rate: audio sampling rate. Defaults to 22050.
        silence_duration: silence duration in milliseconds. Defaults to 1000.

    Returns:
        concatenated audio array.
    """
    silence_samples = int(sampling_rate * silence_duration / 1000)
    # float32 matches the waveforms produced by the model.
    silence = np.zeros(silence_samples, dtype=np.float32)

    audio_num = len(audio_list)
    if audio_num < 2:
        return audio_list[0]
    audio_cat = audio_list[0]
    for i in range(1, audio_num):
        audio_cat = np.concatenate((audio_cat, silence, audio_list[i]), axis=0)

    return audio_cat
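
# Example: concat_audio([a, b], sampling_rate=22050, silence_duration=200)
# returns a, then 0.2 s of silence (4410 zero samples), then b.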


microsoft_url = "https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1"
microsoft_headers = {'Content-Type': 'application/json; charset=utf-8',
                     'Ocp-Apim-Subscription-Key': '1f1ef0ce53b84261be94fab81df7e628'}
microsoft_model_list = [
    "ja-JP-NanamiNeural",
    "ja-JP-KeitaNeural",
    "ja-JP-AoiNeural",
    "ja-JP-DaichiNeural",
    "ja-JP-MayuNeural",
    "ja-JP-NaokiNeural",
    "ja-JP-ShioriNeural"
]

google_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/voice_gq"
google_headers = {'Content-Type': 'application/json; charset=utf-8',
                  'apikey': 'synclub-2383kjhjksxfv.2341gs'}
google_model_list = [
    "ja-JP-Neural2-B",
    "ja-JP-Neural2-C",
    "ja-JP-Neural2-D",
    "ja-JP-Standard-A",
    "ja-JP-Standard-B",
    "ja-JP-Standard-C",
    "ja-JP-Standard-D",
    "ja-JP-Wavenet-A",
    "ja-JP-Wavenet-B",
    "ja-JP-Wavenet-C",
    "ja-JP-Wavenet-D"
]

coefont_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/avatar_coe"
coefont_headers = {'Content-Type': 'application/json; charset=utf-8',
                   'apikey': 'synclub-2383kjhjksxfv.2341gs'}
coefont_id = [
    '3f84b7b1-30fb-4677-a704-fd136515303e',
    '9b826785-bea5-4740-b4cd-e9a286264705',
    '7632cba3-4aca-4cee-9d15-ad1ac31f670c',
    '2c91238a-96f9-4cb6-a69a-461ee66b0e6d',
    '08428dee-65b6-490e-a3a3-60dfcdda889d',
    'c88367bc-5954-426b-a1ba-a683202803c8',
    'fb64a764-91d5-4510-bddd-70df3d62709a',
    '5cfa1f33-bca8-4489-bcbe-701045993162',
    '94cf7792-7c0c-4be4-88e7-c30d26ab6616',
    '81dbd387-6ad6-4b22-93f9-4e2a0091b2fe',
    '931a8568-039a-4cef-add7-bee71629c00e',
    'f91a9d29-c8b4-443f-ba07-82e7e36bd20b',
    '23c76cf0-bee0-47fa-b735-9b7bdba9f26a',
    'cf5fdfb8-85ea-41e1-915b-257936791f17',
    '0f7b53df-3c24-46a5-84d1-cbea39a956c0',
    '3d499385-d331-4cbb-93c0-2057e60eddcf',
    '18ca2f7b-97ca-486d-8f47-858965833642',
    '33e0a2ff-5050-434c-9506-defe97e52f15',
    '516b0f32-8b5f-48c5-b60e-38d508e2b06b',
    'c8720caf-2d2d-4130-8831-92f61f9e25e8',
    '710001f5-e6f5-4cc0-8ba2-e6aa6da8d807',
    'd36f8bb1-8bd8-4e90-964a-9dbd3e374093',
    '2157796c-fe48-4688-b7cc-7ea554edf77d',
    '5cc0dc91-0c6a-4c50-b7d8-f3117cfe44ef',
    'be5c5295-aba2-4055-a9da-8926da7fb5a0',
    '76763239-af14-4c0d-9435-956f096f77dc',
    '10d298ee-ebbf-4838-a6c5-d608f2e3c338',
    '694cb06e-73bd-43c4-94d4-f775ad3dbb26',
    '5cf07e7c-5b1c-4360-a8de-7c928580d4b5',
    '76e2ba06-b23a-4bbe-8148-e30ede9001b9',
    'c25ed97f-78f7-4e8f-b2fa-f8e29633588b',
    'e26382ba-2ae2-4cf7-8c1b-420ab4b845d8',
    '82c4fcf5-d0ee-4fe9-9b0d-89a65d04f290'
]
coefont_model_list = [
    'Canel',
    '胡麻ちゃん',
    'バーチャル悪霊',
    '引寄\u3000法則',
    'にっし~☆',
    '志水 智(Tomo Shimizu)',
    '花撫シア-最高精度-しっかり読み上げ',
    'UNF/UserNotFound',
    'RoBaKu',
    'おにもち',
    '小菅 将太',
    '秋月つむぎ(落ち着いたナレーション)',
    '碧海紘斗_OhmiHiroto',
    'ちくわぶえ',
    'unnamed',
    '今井瑶子(高精度。MC ナレーター 落ち着いたトーンです)',
    '皆のお母さん',
    '後藤邑子',
    '田中和彦',
    'KTNR',
    '天渡\u3000早苗',
    '須戸ゼロ',
    'とり藻々',
    '武田 祐子',
    '【PRO】落ち着きナレーション♯畑耕平',
    '音暖ののん Ver2.0(最高精度)',
    'ろさちゃん-soft-v2[最高精度] ¦ Losa-chan -soft- ∀ -汎用式概念χ',
    'パイナップル秀夫お姉さん',
    'minamo',
    'あさのゆき',
    '聲華 琴音【紡】',
    '黄琴海月【うるとら】',
    '高橋 俊輔'
]
# Maps a voice's display name to its CoeFont voice ID.
coefont_id_model_name_dict = dict(zip(coefont_model_list, coefont_id))

all_example = "今日は天気がいいから、一緒にハイキングに行きましょう。"
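# English gloss: "The weather is nice today, so let's go hiking together."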


def get_text(text, hps):
    """Convert raw text into a tensor of symbol IDs for the model.

    :param text: input text.
    :param hps: hyperparameters loaded from the model config.
    :return: LongTensor of symbol IDs.
    """
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Intersperse a blank token (ID 0) between symbols, matching how
        # the model was trained when add_blank is set.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm


def create_tts_fn(model, hps):
    """Build a Gradio callback that synthesizes speech with the given model.

    :param model: a SynthesizerTrn instance in eval mode.
    :param hps: hyperparameters loaded from the model config.
    :return: a tts_fn(text, speed, noise_scale, noise_scale_w, volume) callback.
    """
    def tts_fn(text, speed, noise_scale=.667, noise_scale_w=0.8, volume=1.0):
        """Synthesize text sentence by sentence and concatenate the results.

        :param text: input text (may contain several sentences).
        :param speed: speaking speed; the model's length_scale is 1 / speed.
        :param noise_scale: sampling noise for the prior (output variability).
        :param noise_scale_w: sampling noise for the duration predictor.
        :param volume: linear gain applied to the output waveform.
        :return: status message and a (sampling_rate, audio) tuple.
        """
        sentences = split_text(text, punct_regex)
        audio_list = []
        for sentence in sentences:
            stn_tst = get_text(sentence, hps)
            with no_grad():
                x_tst = stn_tst.unsqueeze(0).to(device)
                x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
                audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                                    length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
            audio_list.append(audio)
            del stn_tst, x_tst, x_tst_lengths
        audio = concat_audio(audio_list, hps.data.sampling_rate, silence_duration)
        audio = audio * volume
        return "Success", (hps.data.sampling_rate, audio)
    return tts_fn
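
# Example:
#   tts_fn = create_tts_fn(model, hps)
#   message, (sr, audio) = tts_fn(all_example, speed=1.0)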


def microsoft(text, name, style="Neural"):
    """Synthesize text with a Microsoft voice via the TTS proxy.

    :param text: input text.
    :param name: voice name, e.g. "ja-JP-NanamiNeural".
    :param style: speaking style. Defaults to "Neural".
    :return: status message and URL of the generated audio.
    """
    data = {
        "text": text,
        "name": name,
        "style": style,
        "format": "mp3"}
    audio_url = requests.get(microsoft_url, headers=microsoft_headers, json=data).json()['data']['url']
    return "Success", audio_url
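
# Example: microsoft("こんにちは", "ja-JP-NanamiNeural") -> ("Success", audio_url)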


def google(text, name):
    """Synthesize text with a Google voice via the TTS proxy.

    :param text: input text.
    :param name: voice name, e.g. "ja-JP-Wavenet-A".
    :return: status message and URL of the generated audio.
    """
    data = {
        "text": text,
        "name": name,
        "sample_rate": 16000}
    audio_url = requests.get(google_url, headers=google_headers, json=data).json()['data']['url']
    return "Success", audio_url
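
# Example: google("こんにちは", "ja-JP-Wavenet-A") -> ("Success", audio_url)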


def coefont(text, name):
    """Synthesize text with a CoeFont voice via the TTS proxy.

    :param text: input text.
    :param name: display name of the voice; mapped to a CoeFont ID internally.
    :return: status message and URL of the generated audio.
    """
    data = {
        "text": text,
        "coefont": coefont_id_model_name_dict[name]
    }
    audio_url = requests.get(coefont_url, headers=coefont_headers, json=data).json()['data']['url']
    return "Success", audio_url
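
# Example: coefont("こんにちは", "Canel") -> ("Success", audio_url)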


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--share', action="store_true", default=False, help="share gradio app")
    parser.add_argument('--port', type=int, default=8080, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()

    device = torch.device(args.device)
    models_tts = []
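
    # Expected structure of the model-info file (keys are those read below;
    # values are illustrative):
    # {
    #   "0": {
    #     "model_name": "...", "author": "...", "lang": "ja", "example": "...",
    #     "config_path": "/path/to/config.json", "model_path": "/path/to/model.pth",
    #     "model_type": "vits"
    #   }
    # }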

    with open(args.model_info_path, "r", encoding="utf-8") as f:
        models_info = json.load(f)

    for i, info in models_info.items():
        model_name = info["model_name"]
        author = info["author"]
        lang = info["lang"]
        example = info["example"]
        config_path = info["config_path"]
        model_path = info["model_path"]
        model_type = info["model_type"]

        hps = utils.get_hparams_from_file(config_path)
        if model_type == "vits":
            emotion_type = None
        elif model_type == "vits-emotion":
            emotion_type = "embedding"
        elif model_type == "vits-emotion-logits":
            emotion_type = "logits"
        else:
            raise ValueError(f"unknown model_type: {model_type}")

        model = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            emotion_type=emotion_type,
            **hps.model)

        utils.load_checkpoint(model_path, model, None)
        model.eval().to(device)
        # Only plain VITS models are exposed as tabs in the demo.
        if model_type == "vits":
            models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))

    app = gr.Blocks()
    with app:
        gr.Markdown("## Japanese TTS Demo")
        with gr.Tabs():
            with gr.TabItem("In-house"):
                with gr.Tabs():
                    for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts):
                        with gr.TabItem(model_name):
                            with gr.Column():
                                tts_input1 = gr.TextArea(label="Text", value=example)
                                tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
                                # The noise sliders default to the same values tts_fn uses.
                                tts_input3 = gr.Slider(label="noise_scale", value=0.667, minimum=0.0, maximum=2, step=0.1)
                                tts_input4 = gr.Slider(label="noise_scale_w", value=0.8,
                                                       minimum=0.0, maximum=2, step=0.1)
                                tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
                                tts_submit = gr.Button("Generate", variant="primary")
                                tts_output1 = gr.Textbox(label="Output Message")
                                tts_output2 = gr.Audio(label="Output Audio")
                                tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
                                                 [tts_output1, tts_output2])

            with gr.TabItem("Google"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(google_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(google, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

            with gr.TabItem("Microsoft"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

            with gr.TabItem("CoeFont"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(coefont_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(coefont, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])

    app.queue(concurrency_count=5).launch(show_api=False,
                                          share=args.share,
                                          server_name='0.0.0.0',
                                          server_port=args.port,
                                          show_error=True)