# flake8: noqa: E402
import os
import logging

import re_matching
from tools.sentence import split_by_language

logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

logging.basicConfig(
    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
)

logger = logging.getLogger(__name__)

import torch
import ssl

ssl._create_default_https_context = ssl._create_unverified_context
import nltk

nltk.download("cmudict")
import utils
from infer import infer, latest_version, get_net_g, infer_multilang
import gradio as gr
import webbrowser
import numpy as np
from config import config
from tools.translate import translate
import librosa

net_g = None

device = config.webui_config.device
if device == "mps":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"


def generate_audio(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
    reference_audio,
    emotion,
    style_text,
    style_weight,
    skip_start=False,
    skip_end=False,
):
    audio_list = []
    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
    with torch.no_grad():
        for idx, piece in enumerate(slices):
            # Every slice except the first skips its start, every slice
            # except the last skips its end, so the pieces join smoothly.
            skip_start = idx != 0
            skip_end = idx != len(slices) - 1
            audio = infer(
                piece,
                reference_audio=reference_audio,
                emotion=emotion,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language,
                hps=hps,
                net_g=net_g,
                device=device,
                skip_start=skip_start,
                skip_end=skip_end,
                style_text=style_text,
                style_weight=style_weight,
            )
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
            audio_list.append(audio16bit)
    return audio_list


def generate_audio_multilang(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
    reference_audio,
    emotion,
    skip_start=False,
    skip_end=False,
):
    audio_list = []
    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
    with torch.no_grad():
        for idx, piece in enumerate(slices):
            skip_start = idx != 0
            skip_end = idx != len(slices) - 1
            audio = infer_multilang(
                piece,
                reference_audio=reference_audio,
                emotion=emotion,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language[idx],
                hps=hps,
                net_g=net_g,
                device=device,
                skip_start=skip_start,
                skip_end=skip_end,
            )
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
            audio_list.append(audio16bit)
    return audio_list


def tts_split(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    cut_by_sent,
    interval_between_para,
    interval_between_sent,
    reference_audio,
    emotion,
    style_text,
    style_weight,
):
    while text.find("\n\n") != -1:
        text = text.replace("\n\n", "\n")
    text = text.replace("|", "")
    para_list = re_matching.cut_para(text)
    para_list = [p for p in para_list if p != ""]
    audio_list = []
    for p in para_list:
        if not cut_by_sent:
            audio_list += process_text(
                p,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                reference_audio,
                emotion,
                style_text,
                style_weight,
            )
            # Insert silence between paragraphs
            silence = np.zeros(int(44100 * interval_between_para), dtype=np.int16)
            audio_list.append(silence)
        else:
            audio_list_sent = []
            sent_list = re_matching.cut_sent(p)
            sent_list = [s for s in sent_list if s != ""]
            for s in sent_list:
                audio_list_sent += process_text(
                    s,
                    speaker,
                    sdp_ratio,
                    noise_scale,
                    noise_scale_w,
                    length_scale,
                    language,
                    reference_audio,
                    emotion,
                    style_text,
                    style_weight,
                )
                # Insert silence between sentences
                silence = np.zeros(int(44100 * interval_between_sent))
                audio_list_sent.append(silence)
            if (interval_between_para - interval_between_sent) > 0:
                silence = np.zeros(
                    int(44100 * (interval_between_para - interval_between_sent))
                )
                audio_list_sent.append(silence)
            audio16bit = gr.processing_utils.convert_to_16_bit_wav(
                np.concatenate(audio_list_sent)
            )  # Volume-normalize the complete sentence
            audio_list.append(audio16bit)
    audio_concat = np.concatenate(audio_list)
    return ("Success", (hps.data.sampling_rate, audio_concat))


def process_mix(slice):
    _speaker = slice.pop()
    _text, _lang = [], []
    for lang, content in slice:
        content = content.split("|")
        content = [part for part in content if part != ""]
        if len(content) == 0:
            continue
        if len(_text) == 0:
            _text = [[part] for part in content]
            _lang = [[lang] for part in content]
        else:
            _text[-1].append(content[0])
            _lang[-1].append(lang)
            if len(content) > 1:
                _text += [[part] for part in content[1:]]
                _lang += [[lang] for part in content[1:]]
    return _text, _lang, _speaker


def process_auto(text):
    # Split each "|"-separated chunk by detected language (zh / ja / en)
    _text, _lang = [], []
    for slice in text.split("|"):
        if slice == "":
            continue
        temp_text, temp_lang = [], []
        sentences_list = split_by_language(slice, target_languages=["zh", "ja", "en"])
        for sentence, lang in sentences_list:
            if sentence == "":
                continue
            temp_text.append(sentence)
            temp_lang.append(lang.upper())
        _text.append(temp_text)
        _lang.append(temp_lang)
    return _text, _lang


def process_text(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    reference_audio,
    emotion,
    style_text=None,
    style_weight=0,
):
    audio_list = []
    if language == "mix":
        bool_valid, str_valid = re_matching.validate_text(text)
        if not bool_valid:
            return str_valid, (
                hps.data.sampling_rate,
                np.concatenate([np.zeros(hps.data.sampling_rate // 2)]),
            )
        for slice in re_matching.text_matching(text):
            _text, _lang, _speaker = process_mix(slice)
            if _speaker is None:
                continue
            print(f"Text: {_text}\nLang: {_lang}")
            audio_list.extend(
                generate_audio_multilang(
                    _text,
                    sdp_ratio,
                    noise_scale,
                    noise_scale_w,
                    length_scale,
                    _speaker,
                    _lang,
                    reference_audio,
                    emotion,
                )
            )
    elif language.lower() == "auto":
        _text, _lang = process_auto(text)
        print(f"Text: {_text}\nLang: {_lang}")
        _lang = [
            [lang.replace("JA", "JP") for lang in lang_list] for lang_list in _lang
        ]
        audio_list.extend(
            generate_audio_multilang(
                _text,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                speaker,
                _lang,
                reference_audio,
                emotion,
            )
        )
    else:
        audio_list.extend(
            generate_audio(
                text.split("|"),
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                speaker,
                language,
                reference_audio,
                emotion,
                style_text,
                style_weight,
            )
        )
    return audio_list


def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    reference_audio,
    emotion,
    prompt_mode,
    style_text=None,
    style_weight=0,
):
    if style_text == "":
        style_text = None
    if prompt_mode == "Audio prompt":
        if reference_audio is None:
            return ("Invalid audio prompt", None)
        else:
            reference_audio = load_audio(reference_audio)[1]
    else:
        reference_audio = None

    audio_list = process_text(
        text,
        speaker,
        sdp_ratio,
        noise_scale,
        noise_scale_w,
        length_scale,
        language,
        reference_audio,
        emotion,
        style_text,
        style_weight,
    )

    audio_concat = np.concatenate(audio_list)
    return "Success", (hps.data.sampling_rate, audio_concat)


def format_utils(text, speaker):
    _text, _lang = process_auto(text)
    res = f"[{speaker}]"
    for lang_s, content_s in zip(_lang, _text):
        for lang, content in zip(lang_s, content_s):
            res += f"<{lang.lower()}>{content}"
        res += "|"
    return "mix", res[:-1]


def load_audio(path):
    # sr must be passed as a keyword argument in recent librosa versions
    audio, sr = librosa.load(path, sr=48000)
    # audio = librosa.resample(audio, 44100, 48000)
    return sr, audio


def gr_util(item):
    if item == "Text prompt":
        return {"visible": True, "__type__": "update"}, {
            "visible": False,
            "__type__": "update",
        }
    else:
        return {"visible": False, "__type__": "update"}, {
            "visible": True,
            "__type__": "update",
        }


import json


def load_json(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data


if __name__ == "__main__":
    if config.webui_config.debug:
        logger.info("Enable DEBUG-LEVEL log")
        logging.basicConfig(level=logging.DEBUG)
    hps = utils.get_hparams_from_file(config.webui_config.config_path)
    # If config.json does not specify a version, default to the latest version
    version = hps.version if hasattr(hps, "version") else latest_version
    net_g = get_net_g(
        model_path=config.webui_config.model, version=version, device=device, hps=hps
    )
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["ZH", "JP", "EN", "auto", "mix"]

    author_and_voice_data = load_json("author_and_voice_data.json")

    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    value=f"""
作者:{author_and_voice_data['author']}\n
聲音歸屬:{author_and_voice_data['voice']}\n
Bert-VITS2項目:https://github.com/fishaudio/Bert-VITS2\n
Bert-VITS2-Colab:https://github.com/ADT109119/Bert-VITS2-Colab\n
使用本模型請嚴格遵守法規!\n
發布二創作品請標註本計畫作者及連結、作品使用Bert-VITS2 AI生成!\n
【提示】手機端容易誤觸調節,請刷新恢復預設!
每次產生的結果都不一樣,效果不好請嘗試多次產生與調節,選擇最佳結果!\n
"""
                )
                text = gr.TextArea(
                    label="輸入文本內容",
                    placeholder="""
推薦不同語言分開推理,因為無法連貫且可能影響最終效果!
若選擇語言為'mix',必須依照格式輸入,否則報錯:
    格式舉例(zh是中文,jp是日語,en是英語;不區分大小寫):
    [說話者]<zh>你好 <jp>こんにちは <en>Hello
    另外,所有的語言選項都可以用'|'分割長段實現分句生成。
""",
                )
                speaker = gr.Dropdown(
                    choices=speakers, value=speakers[0], label="Speaker"
                )
                _ = gr.Markdown(
                    value="提示模式(Prompt mode):可選文字提示或音訊提示,用於產生文字或音訊指定風格的聲音。\n",
                    visible=False,
                )
                prompt_mode = gr.Radio(
                    ["Text prompt", "Audio prompt"],
                    label="Prompt Mode",
                    value="Text prompt",
                    visible=False,
                )
                text_prompt = gr.Textbox(
                    label="Text prompt",
                    placeholder="用文字描述生成風格。如:Happy",
                    value="Happy",
                    visible=False,
                )
                audio_prompt = gr.Audio(
                    label="Audio prompt", type="filepath", visible=False
                )
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.5, step=0.01, label="SDP Ratio"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.5, step=0.01, label="Noise"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.9, step=0.01, label="Noise_W"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1.0, step=0.01, label="Length"
                )
                language = gr.Dropdown(
                    choices=languages, value=languages[0], label="Language"
                )
                btn = gr.Button("點擊生成", variant="primary")
            with gr.Column():
                with gr.Accordion("融合文本語義", open=False):
                    gr.Markdown(
                        value="使用輔助文本的語意來輔助生成對話(語言保持與主文本相同)\n\n"
                        "**注意**:不要使用**指令式文字**(如:開心),要使用**帶有強烈情感的文本**(如:我好快樂!!!)\n\n"
                        "效果較不明確,留空即為不使用該功能"
                    )
                    style_text = gr.Textbox(label="輔助文本")
                    style_weight = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.7,
                        step=0.1,
                        label="Weight",
                        info="主文本和輔助文本的bert混合比率,0表示僅主文本,1表示僅輔助文本",
                    )
                with gr.Row():
                    with gr.Column():
                        interval_between_sent = gr.Slider(
                            minimum=0,
                            maximum=5,
                            value=0.2,
                            step=0.1,
                            label="句間停頓(秒),勾選按句切分才生效",
                        )
                        interval_between_para = gr.Slider(
                            minimum=0,
                            maximum=10,
                            value=1,
                            step=0.1,
                            label="段間停頓(秒),需要大於句間停頓才有效",
                        )
                        opt_cut_by_sent = gr.Checkbox(
                            label="按句切分 在按段落切分的基礎上再按句子切分文本"
                        )
                        slicer = gr.Button("切分生成", variant="primary")
                text_output = gr.Textbox(label="狀態訊息")
                audio_output = gr.Audio(label="輸出音頻")
                # explain_image = gr.Image(
                #     label="参数解释信息",
                #     show_label=True,
                #     show_share_button=False,
                #     show_download_button=False,
                #     value=os.path.abspath("./img/参数说明.png"),
                # )
        btn.click(
            tts_fn,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                audio_prompt,
                text_prompt,
                prompt_mode,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
            api_name="api",
        )

        slicer.click(
            tts_split,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                opt_cut_by_sent,
                interval_between_para,
                interval_between_sent,
                audio_prompt,
                text_prompt,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
        )

        prompt_mode.change(
            lambda x: gr_util(x),
            inputs=[prompt_mode],
            outputs=[text_prompt, audio_prompt],
        )

        audio_prompt.upload(
            lambda x: load_audio(x),
            inputs=[audio_prompt],
            outputs=[audio_prompt],
        )

    app.launch(show_error=True)