# flake8: noqa: E402
import os
import logging
import re
import subprocess

import re_matching
from tools.sentence import split_by_language, sentence_split

# Silence chatty third-party loggers before configuring our own.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)
logging.basicConfig(
    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
)
logger = logging.getLogger(__name__)

import soundfile as sf
import torch
import utils
from infer import infer, latest_version, get_net_g
import gradio as gr
import webbrowser
import numpy as np
from config import config
from tools.translate import translate

# Populated in the __main__ section below; generate_audio/tts_split/tts_fn
# read these as module-level globals.
net_g = None
device = config.webui_config.device
if device == "mps":
    # Allow PyTorch to fall back to CPU for ops the MPS backend lacks.
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"


def convert_wav_to_ogg(wav_file):
    """Convert an (open) WAV file to OGG/Opus with ffmpeg.

    The input file is moved to ``in/in.wav`` and the result is written to
    ``out/out.ogg``; both directories are created on demand.

    :param wav_file: open file object whose ``.name`` points at the WAV file.
    :return: path of the generated OGG file (``out/out.ogg``).
    """
    os.makedirs('out', exist_ok=True)
    os.makedirs('in', exist_ok=True)
    output_path_ogg = os.path.join('out', "out.ogg")
    renamed_input_path = os.path.join('in', "in.wav")
    os.rename(wav_file.name, renamed_input_path)
    # Pass an argument list to subprocess.run instead of the original
    # os.system(" ".join(...)): no shell involved, so paths containing
    # spaces or shell metacharacters cannot break or hijack the command.
    command = [
        "ffmpeg",
        "-i",
        renamed_input_path,
        "-acodec",
        "libopus",
        "-y",
        output_path_ogg,
    ]
    # check=False mirrors os.system's behaviour of ignoring a non-zero exit.
    subprocess.run(command, check=False)
    return output_path_ogg


def merge_sentences(sentences, max_length=50):
    """Greedily merge consecutive sentences into chunks of at most
    *max_length* characters.

    Blank/whitespace-only sentences are dropped.  A single sentence longer
    than *max_length* is kept whole (it starts its own chunk).

    :param sentences: iterable of sentence strings.
    :param max_length: soft upper bound on chunk length, in characters.
    :return: list of merged, non-empty chunks.
    """
    sentences = [s for s in sentences if s.strip()]
    result = []
    temp_sentence = ""
    for sentence in sentences:
        if len(temp_sentence) + len(sentence) > max_length:
            # Flush the current chunk and start a new one with this sentence.
            if temp_sentence:
                result.append(temp_sentence)
            temp_sentence = sentence
        else:
            temp_sentence += sentence
    if temp_sentence:
        result.append(temp_sentence)
    return result


def generate_audio(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
):
    """Synthesize every text piece in *slices* with the loaded model.

    Uses the module-level globals ``hps``, ``net_g`` and ``device``
    (initialised in ``__main__``).

    :return: list of int16 numpy arrays — each synthesized piece followed by
        half a second of silence.
    """
    audio_list = []
    # Half a second of silence at the model's sampling rate, int16 to match
    # the converted audio chunks.
    silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
    with torch.no_grad():
        for piece in slices:
            audio = infer(
                piece,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language,
                hps=hps,
                net_g=net_g,
                device=device,
            )
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
            audio_list.append(audio16bit)
            audio_list.append(silence)  # pause between pieces
    return audio_list


def tts_split(
    text: str,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    cut_by_sent,
    interval_between_para,
    interval_between_sent,
):
    """Split *text* into paragraphs (and optionally sentences), synthesize
    each part, and write the concatenated result to ``tmp.wav`` plus an OGG
    conversion.

    The speaker is hard-coded to sid "ign" (single-speaker model).

    :param cut_by_sent: when True, additionally split paragraphs into
        sentences and insert *interval_between_sent* seconds of silence
        between them.
    :param interval_between_para: pause between paragraphs, in seconds.
    :param interval_between_sent: pause between sentences, in seconds.
    :return: ``(status, (sample_rate, audio), ogg_path)``.
    """
    if language == "mix":
        # "mix" needs the bracketed multi-speaker format handled by tts_fn.
        # FIX: return three values so all UI output components are fed.
        return ("invalid", None, None)
    # Collapse blank lines so paragraphs are separated by a single newline.
    while text.find("\n\n") != -1:
        text = text.replace("\n\n", "\n")
    para_list = re_matching.cut_para(text)
    audio_list = []
    if not cut_by_sent:
        for p in para_list:
            audio = infer(
                p,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid="ign",
                language=language,
                hps=hps,
                net_g=net_g,
                device=device,
            )
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
            audio_list.append(audio16bit)
            silence = np.zeros(int(44100 * interval_between_para), dtype=np.int16)
            audio_list.append(silence)
    else:
        for p in para_list:
            audio_list_sent = []
            sent_list = re_matching.cut_sent(p)
            sent_list = merge_sentences(sent_list)
            for s in sent_list:
                print(s)
                audio = infer(
                    s,
                    sdp_ratio=sdp_ratio,
                    noise_scale=noise_scale,
                    noise_scale_w=noise_scale_w,
                    length_scale=length_scale,
                    sid="ign",
                    language=language,
                    hps=hps,
                    net_g=net_g,
                    device=device,
                )
                audio_list_sent.append(audio)
                silence = np.zeros(int(44100 * interval_between_sent))
                audio_list_sent.append(silence)
            if (interval_between_para - interval_between_sent) > 0:
                # Top up the sentence pause so the total gap after the
                # paragraph equals interval_between_para.
                silence = np.zeros(
                    int(44100 * (interval_between_para - interval_between_sent))
                )
                audio_list_sent.append(silence)
            # BUG FIX: the original did ``audio_list = audio_list_sent``,
            # which threw away every paragraph except the last one.
            audio_list.extend(audio_list_sent)
    audio_concat = np.concatenate(audio_list)
    sf.write("tmp.wav", audio_concat, 44100)
    with open('tmp.wav', 'rb') as wav_file:
        newogg = convert_wav_to_ogg(wav_file)
    return ("Success", (44100, audio_concat), newogg)


def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
):
    """Synthesize *text* in a single call.

    Supports three modes: explicit "mix" multi-speaker markup, "auto"
    per-sentence language detection (zh/ja/en), or a fixed language.  Long
    segments can be split manually with '|'.

    :return: ``(status, (sample_rate, audio), ogg_path)``.
    """
    audio_list = []
    if language == "mix":
        bool_valid, str_valid = re_matching.validate_text(text)
        if not bool_valid:
            # FIX: return three values to match the success path's arity.
            return (
                str_valid,
                (
                    hps.data.sampling_rate,
                    np.concatenate([np.zeros(hps.data.sampling_rate // 2)]),
                ),
                None,
            )
        result = re_matching.text_matching(text)
        for one in result:
            _speaker = one.pop()
            for lang, content in one:
                audio_list.extend(
                    generate_audio(
                        content.split("|"),
                        sdp_ratio,
                        noise_scale,
                        noise_scale_w,
                        length_scale,
                        _speaker,
                        lang,
                    )
                )
    elif language.lower() == "auto":
        sentences_list = split_by_language(text, target_languages=["zh", "ja", "en"])
        for sentences, lang in sentences_list:
            lang = lang.upper()
            if lang == "JA":
                # The model uses "JP" where the detector reports "JA".
                lang = "JP"
            sentences = sentence_split(sentences, max=250)
            for content in sentences:
                audio_list.extend(
                    generate_audio(
                        content.split("|"),
                        sdp_ratio,
                        noise_scale,
                        noise_scale_w,
                        length_scale,
                        speaker,
                        lang,
                    )
                )
    else:
        audio_list.extend(
            generate_audio(
                text.split("|"),
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                speaker,
                language,
            )
        )
    audio_concat = np.concatenate(audio_list)
    sf.write("tmp.wav", audio_concat, 44100)
    with open('tmp.wav', 'rb') as wav_file:
        newogg = convert_wav_to_ogg(wav_file)
    return ("Success", (44100, audio_concat), newogg)


if __name__ == "__main__":
    if config.webui_config.debug:
        logger.info("Enable DEBUG-LEVEL log")
        logging.basicConfig(level=logging.DEBUG)
    hps = utils.get_hparams_from_file("config.json")
    # Default to the latest version when config.json does not specify one.
    version = hps.version if hasattr(hps, "version") else latest_version
    net_g = get_net_g(
        model_path="models/xt_dmc.pth", version=version, device=device, hps=hps
    )
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["ZH", "JP", "mix", "auto"]
    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                gr.Markdown(value="""
星瞳2.0-读书(冬牧场) 中日英三语 Bert-Vits2在线语音生成\n
1、模型作者:数字星瞳企划 https://t.me/xingtong25680 \n
\n
2、原项目地址:https://github.com/Stardust-minus/Bert-VITS2\n
3、使用此模型进行二创请注明AI生成,以及原项目地址。\n
""")
                language = gr.Dropdown(
                    choices=languages,
                    value=languages[0],
                    label="选择语言(中文/日语/英语)",
                )
                text = gr.TextArea(
                    label="输入文本内容",
                    value="哎,要跟大家讲,就是那个呃,我前段时间呢在家收拾卫生的时候,我突然发现了一本书,然后那个书上它没有写作者,然后我也完全没有印象是什么时候买的了,里面的这个内容我也没见过。我看了一下,我感觉里面那个故事呢非常有意思,有点像是我的同人小书,就是我看的感觉,哎,就是有一种眼前一亮的感觉。但是想想也挺奇怪的,因为毕竟是我的家,我就想难道是我搬家的时候是谁辣在我家的吗?工具人来帮我搬家的时候辣的,我感觉像谁故意留在我家的。所以呢星期五我准备给大家完整的讲一讲这个故事,你们愿意来听吗?应该会是一个蛮奇妙的故事吧。如果大家愿意来的话,那星期五的直播大家要早点来哦,一定要早点来,因为来晚了,可能你从中间或者是有一段没看,你这故事你就听不懂啦,答应我好吗?星期五早点来哦,八点直播的时候,大家八点就要来哦。那这样的话就要跟小星星们说晚安了,我们星期五见面好吗?",
                    placeholder="""
如果你选择语言为\'mix\',必须按照格式输入,否则报错:
    格式举例(zh是中文,jp是日语,不区分大小写;说话人举例:gongzi):
     [说话人1]你好,こんにちは! こんにちは,世界。
     [说话人2]你好吗?元気ですか?
     [说话人3]谢谢。どういたしまして。
     ...
    另外,所有的语言选项都可以用'|'分割长段实现分句生成。
""",
                )
                opt_cut_by_sent = gr.Checkbox(
                    value=True, label="按句号切分"
                )
                interval_between_sent = gr.Slider(
                    minimum=0,
                    maximum=5,
                    value=0.13,
                    step=0.01,
                    label="每句话之间的停顿",
                )
                interval_between_para = gr.Slider(
                    minimum=0,
                    maximum=10,
                    value=0.13,
                    step=0.01,
                    label="每段话之间的停顿",
                )
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1.0, step=0.01, label="语速"
                )
                btn = gr.Button("生成音频!", variant="primary")
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        text_output = gr.Textbox(label="状态信息")
                        audio_output = gr.Audio(label="输出音频")
                        ogg_output = gr.File(label="Converted OGG file")
                        gr.Markdown(value="""
模型汇总:\n
星瞳整合 https://huggingface.co/spaces/digitalxingtong/Xingtong-All-in-One\n
甜甜叫花鸡 https://huggingface.co/spaces/digitalxingtong/Jiaohuaji-Bert-Vits2 \n
七海 https://huggingface.co/spaces/digitalxingtong/Nanami-Bert-Vits2 \n
东雪莲 https://huggingface.co/spaces/digitalxingtong/Azuma-Bert-Vits2 \n
嘉然 https://huggingface.co/spaces/digitalxingtong/Jiaran-Bert-Vits2 \n
乃琳 https://huggingface.co/spaces/digitalxingtong/Eileen-Bert-Vits2 \n
恬豆 https://huggingface.co/spaces/digitalxingtong/Dou-Bert-Vits2 \n
奶绿 杂谈 https://huggingface.co/spaces/digitalxingtong/Nailv-Bert-Vits2 \n
奶绿 朗读 https://huggingface.co/spaces/digitalxingtong/Nailv-read-Bert-Vits2 \n
露早 https://huggingface.co/spaces/digitalxingtong/Luzao-Bert-Vits2 \n
柚恩 https://huggingface.co/spaces/digitalxingtong/Un-Bert-Vits2 \n
米诺 https://huggingface.co/spaces/digitalxingtong/Minuo-Bert-Vits2 \n
扇宝 https://huggingface.co/spaces/digitalxingtong/Shanbao-Bert-Vits2 \n
牧牧白 https://huggingface.co/spaces/digitalxingtong/Miiu-Bert-Vits2 \n
吉诺儿kino https://huggingface.co/spaces/digitalxingtong/Kino-Bert-Vits2 \n
九夏 https://huggingface.co/spaces/digitalxingtong/Jiuxia-Bert-Vits2 \n
卡缇娅 https://huggingface.co/spaces/digitalxingtong/Yaya-Bert-Vits2 \n
理想_ideal https://huggingface.co/spaces/digitalxingtong/Lixiang-Bert-Vits2 \n
阿梓 https://huggingface.co/spaces/digitalxingtong/Azusa-Bert-Vits2 \n
鹿鸣 https://huggingface.co/spaces/digitalxingtong/Luming-Bert-Vits2 \n
永雏塔菲 https://huggingface.co/spaces/digitalxingtong/Taffy-Bert-VITS2 \n
""")
        # Note: the button is wired to tts_split (paragraph/sentence mode);
        # the speaker is fixed to sid "ign" inside tts_split.
        btn.click(
            tts_split,
            inputs=[
                text,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                opt_cut_by_sent,
                interval_between_para,
                interval_between_sent,
            ],
            outputs=[text_output, audio_output, ogg_output],
        )

    print("推理页面已开启!")
    webbrowser.open(f"http://127.0.0.1:{config.webui_config.port}")
    app.launch(share=config.webui_config.share, server_port=config.webui_config.port)