#!/usr/bin/env python3 # -*- coding=utf8 -*- ######################################################################## # # Copyright (c) 2023 Baidu.com, Inc. All Rights Reserved # ######################################################################## """ Author: linxiaolong """ import warnings warnings.filterwarnings("ignore") # 外部库 import re import requests import argparse import json import os import re import tempfile import librosa import numpy as np # import torch # from torch import no_grad, LongTensor import commons import gradio as gr import gradio.utils as gr_utils import gradio.processing_utils as gr_processing_utils # 内部库 from models import SynthesizerTrn from text import text_to_sequence, text_to_sequence_for_test, _clean_text from mel_processing import spectrogram_torch import utils from text.symbols import symbols limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces punct_regex = re.compile(r"[\.!\?。!?]") silence_duration = 200 def split_text(text, regex): """Split text into sentences by puncutations. Args: text: long text. regex: puncutation regex. Returns: list of sentences. """ sentences = re.split(regex, text) puncts = re.findall(regex, text) for i, sentence in enumerate(sentences): if sentence == "": continue if i < len(puncts): sentences[i] = sentences[i] + puncts[i] else: sentences[i] = sentences[i] + "。" sentences = [i for i in sentences if i != ""] return sentences def concat_audio(audio_list, sampling_rate=22050, silence_duration=1000): """Concatenate audio files and insert silence between them. Args: audio_list: list of audio files. sampling_rate: audio sampling rate. Defaults to 22050. silence_duration: silence duration in miliseconds. Defaults to 1000. Returns: concatenated audio. """ silence_samples = int(sampling_rate * silence_duration / 1000) silence = np.zeros(silence_samples, dtype=np.float16) audio_num = len(audio_list) if audio_num < 2: return audio_list[0] audio_cat = audio_list[0] for i in range(1, audio_num): audio_cat = np.concatenate((audio_cat, silence, audio_list[i]), axis=0) return audio_cat ### 外部TTS的超参数 microsoft_url = "https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1" microsoft_headers = {'Content-Type': 'application/json; charset=utf-8', 'Ocp-Apim-Subscription-Key':'1f1ef0ce53b84261be94fab81df7e628'} microsoft_model_list = [ "ja-JP-NanamiNeural", "ja-JP-KeitaNeural", "ja-JP-AoiNeural", "ja-JP-DaichiNeural", "ja-JP-MayuNeural", "ja-JP-NaokiNeural", "ja-JP-ShioriNeural" ] google_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/voice_gq" google_headers = {'Content-Type': 'application/json; charset=utf-8', 'apikey':'synclub-2383kjhjksxfv.2341gs'} google_model_list = [ "ja-JP-Neural2-B", "ja-JP-Neural2-C", "ja-JP-Neural2-D", "ja-JP-Standard-A", "ja-JP-Standard-B", "ja-JP-Standard-C", "ja-JP-Standard-D", "ja-JP-Wavenet-A", "ja-JP-Wavenet-B", "ja-JP-Wavenet-C", "ja-JP-Wavenet-D" ] coefont_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/avatar_coe" coefont_headers = {'Content-Type': 'application/json; charset=utf-8', 'apikey':'synclub-2383kjhjksxfv.2341gs'} coefont_id = [ '3f84b7b1-30fb-4677-a704-fd136515303e', '9b826785-bea5-4740-b4cd-e9a286264705', '7632cba3-4aca-4cee-9d15-ad1ac31f670c', '2c91238a-96f9-4cb6-a69a-461ee66b0e6d', '08428dee-65b6-490e-a3a3-60dfcdda889d', 'c88367bc-5954-426b-a1ba-a683202803c8', 'fb64a764-91d5-4510-bddd-70df3d62709a', '5cfa1f33-bca8-4489-bcbe-701045993162', '94cf7792-7c0c-4be4-88e7-c30d26ab6616', '81dbd387-6ad6-4b22-93f9-4e2a0091b2fe', '931a8568-039a-4cef-add7-bee71629c00e', 'f91a9d29-c8b4-443f-ba07-82e7e36bd20b', '23c76cf0-bee0-47fa-b735-9b7bdba9f26a', 'cf5fdfb8-85ea-41e1-915b-257936791f17', '0f7b53df-3c24-46a5-84d1-cbea39a956c0', '3d499385-d331-4cbb-93c0-2057e60eddcf', '18ca2f7b-97ca-486d-8f47-858965833642', '33e0a2ff-5050-434c-9506-defe97e52f15', '516b0f32-8b5f-48c5-b60e-38d508e2b06b', 'c8720caf-2d2d-4130-8831-92f61f9e25e8', '710001f5-e6f5-4cc0-8ba2-e6aa6da8d807', 'd36f8bb1-8bd8-4e90-964a-9dbd3e374093', '2157796c-fe48-4688-b7cc-7ea554edf77d', '5cc0dc91-0c6a-4c50-b7d8-f3117cfe44ef', 'be5c5295-aba2-4055-a9da-8926da7fb5a0', '76763239-af14-4c0d-9435-956f096f77dc', '10d298ee-ebbf-4838-a6c5-d608f2e3c338', '694cb06e-73bd-43c4-94d4-f775ad3dbb26', '5cf07e7c-5b1c-4360-a8de-7c928580d4b5', '76e2ba06-b23a-4bbe-8148-e30ede9001b9', 'c25ed97f-78f7-4e8f-b2fa-f8e29633588b', 'e26382ba-2ae2-4cf7-8c1b-420ab4b845d8', '82c4fcf5-d0ee-4fe9-9b0d-89a65d04f290' ] coefont_model_list = [ 'Canel', '胡麻ちゃん', 'バーチャル悪霊', '引寄\u3000法則', 'にっし~☆', '志水 智(Tomo Shimizu)', '花撫シア-最高精度-しっかり読み上げ', 'UNF/UserNotFound', 'RoBaKu', 'おにもち', '小菅 将太', '秋月つむぎ(落ち着いたナレーション)', '碧海紘斗_OhmiHiroto', 'ちくわぶえ', 'unnamed', '今井瑶子(高精度。MC ナレーター 落ち着いたトーンです)', '皆のお母さん', '後藤邑子', '田中和彦', 'KTNR', '天渡\u3000早苗', '須戸ゼロ', 'とり藻々', '武田 祐子', '【PRO】落ち着きナレーション♯畑耕平', '音暖ののん Ver2.0(最高精度)', 'ろさちゃん-soft-v2[最高精度] ¦ Losa-chan -soft- ∀ -汎用式概念χ', 'パイナップル秀夫お姉さん', 'minamo', 'あさのゆき', '聲華 琴音【紡】', '黄琴海月【うるとら】', '高橋 俊輔'] coefont_id_model_name_dict = dict(zip(coefont_model_list, coefont_id)) all_example = "今日は天気がいいから、一緒にハイキングに行きましょう。" # def audio_postprocess(self, y): # """ # 修改gr的音频后处理函数 # :param self: # :param y: # :return: # """ # if y is None: # return None # if gr_utils.validate_url(y): # file = gr_processing_utils.download_to_file(y, dir=self.temp_dir) # elif isinstance(y, tuple): # sample_rate, data = y # file = tempfile.NamedTemporaryFile( # suffix=".wav", dir=self.temp_dir, delete=False # ) # gr_processing_utils.audio_to_file(sample_rate, data, file.name) # else: # file = gr_processing_utils.create_tmp_copy_of_file(y, dir=self.temp_dir) # return gr_processing_utils.encode_url_or_file_to_base64(file.name) # gr.Audio.postprocess = audio_postprocess def get_text(text, hps): """ :param text: :param hps: :param is_symbol: :return: """ # hps中没有包括symbols text_norm = text_to_sequence(text, hps.data.text_cleaners) # hps中有包括symbols # text_norm = text_to_sequence_for_test(text, hps.symbols, hps.data.text_cleaners) if hps.data.add_blank: text_norm = commons.intersperse(text_norm, 0) text_norm = LongTensor(text_norm) return text_norm def create_tts_fn(model, hps): """ :param model: :param hps: :param speaker_ids: :return: """ def tts_fn(text, speed, noise_scale=.667, noise_scale_w=0.8, volume=1.0): """ :param text: :param speaker: :param speed: :param emo: :param volume: :param is_symbol: :return: """ sentences = split_text(text, punct_regex) audio_list = [] for sentence in sentences: stn_tst = get_text(sentence, hps) with no_grad(): x_tst = stn_tst.unsqueeze(0).to(device) x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device) audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy() audio_list.append(audio) del stn_tst, x_tst, x_tst_lengths audio = concat_audio(audio_list, hps.data.sampling_rate, silence_duration) audio = audio * volume return "Success", (hps.data.sampling_rate, audio) return tts_fn def microsoft(text, name, style="Neural"): """ :param text: :param name: :param style: :return: """ headers = { 'Ocp-Apim-Subscription-Key': '1f1ef0ce53b84261be94fab81df7e628', 'Content-Type': 'application/ssml+xml', 'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3', 'User-Agent': 'curl', } data = ("" f"" # xml:gender='Female' f"{text}" "" "") response = requests.post( 'https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1', headers=headers, data=data, proxies= { 'http': 'http://192.168.3.11:80', 'https': 'http://192.168.3.11:80', } ) data = { "text":text, "name":name, "style":style, "format":"mp3"} audio_url = requests.get(microsoft_url, headers=microsoft_headers, json=data).json()['data']['url'] return "Success", audio_url def google(text, name): """ :param text: :param name: :param style: :return: """ data = { "text":text, "name":name, "sample_rate":16000} audio_url = requests.get(google_url, headers=google_headers, json=data).json()['data']['url'] return "Success", audio_url def coefont(text, name): """ :param text: :param name: :param style: :return: """ data = { "text":text, "coefont":coefont_id_model_name_dict[name] } audio_url = requests.get(coefont_url, headers=coefont_headers, json=data).json()['data']['url'] return "Success", audio_url if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--device', type=str, default='cuda') parser.add_argument("--share", action="store_true", default=False, help="share gradio app") parser.add_argument("--port", type=int, default=8080, help="port") parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json') args = parser.parse_args() device = torch.device(args.device) models_tts = [] with open(args.model_info_path, "r", encoding="utf-8") as f: models_info = json.load(f) for i, info in models_info.items(): model_name = info["model_name"] author = info["author"] lang = info["lang"] example = info["example"] config_path = info["config_path"] model_path = info["model_path"] model_type = info["model_type"] hps = utils.get_hparams_from_file(config_path) if model_type == "vits": emotion_type = None elif model_type == "vits-emotion": emotion_type = "embedding" elif model_type == "vits-emotion-logits": emotion_type = "logits" model = SynthesizerTrn( len(symbols), hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, emotion_type=emotion_type, **hps.model) utils.load_checkpoint(model_path, model, None) model.eval().to(device) if model_type == "vits": # 普通TTS models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps))) app = gr.Blocks() with app: gr.Markdown("## Japanese TTS Demo") with gr.Tabs(): with gr.TabItem("自研"): with gr.Tabs(): for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts): with gr.TabItem(model_name): with gr.Column(): tts_input1 = gr.TextArea(label="Text", value=example) tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1) tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1) tts_input4 = gr.Slider(label="noise_scale_w", value=0.0, minimum=0.0, maximum=2, step=0.1) tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1) tts_submit = gr.Button("Generate", variant="primary") tts_output1 = gr.Textbox(label="Output Message") tts_output2 = gr.Audio(label="Output Audio") tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5], [tts_output1, tts_output2]) with gr.TabItem("谷歌"): tts_input1 = gr.TextArea(label="Text", value=all_example) tts_input2 = gr.Dropdown(google_model_list, label="name") tts_submit = gr.Button("Generate", variant="primary") tts_output1 = gr.Textbox(label="Output Message") tts_output2 = gr.Audio(label="Output Audio") tts_submit.click(google, [tts_input1, tts_input2], [tts_output1, tts_output2]) with gr.TabItem("微软"): tts_input1 = gr.TextArea(label="Text", value=all_example) tts_input2 = gr.Dropdown(microsoft_model_list, label="name") tts_submit = gr.Button("Generate", variant="primary") tts_output1 = gr.Textbox(label="Output Message") tts_output2 = gr.Audio(label="Output Audio") tts_submit.click(microsoft, [tts_input1, tts_input2], [tts_output1, tts_output2]) with gr.TabItem("coefont"): tts_input1 = gr.TextArea(label="Text", value=all_example) tts_input2 = gr.Dropdown(coefont_model_list, label="name") tts_submit = gr.Button("Generate", variant="primary") tts_output1 = gr.Textbox(label="Output Message") tts_output2 = gr.Audio(label="Output Audio") tts_submit.click(coefont, [tts_input1, tts_input2], [tts_output1, tts_output2]) app.queue(concurrency_count=5).launch(show_api=False, share=args.share, server_name='0.0.0.0', server_port=args.port, show_error=True)