# flake8: noqa: E402 import logging logging.getLogger("numba").setLevel(logging.WARNING) logging.getLogger("markdown_it").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) logging.getLogger("matplotlib").setLevel(logging.WARNING) logging.basicConfig( level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s" ) logger = logging.getLogger(__name__) import datetime import numpy as np import torch from ebooklib import epub import PyPDF2 from PyPDF2 import PdfReader import zipfile import shutil import sys, os import json from bs4 import BeautifulSoup import argparse import commons import utils from models import SynthesizerTrn from text.symbols import symbols from text import cleaned_text_to_sequence, get_bert from text.cleaner import clean_text import gradio as gr import webbrowser import re from scipy.io.wavfile import write net_g = None BandList = { "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"], "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"], "HelloHappyWorld":["こころ","ミッシェル","薫","花音","はぐみ"], "PastelPalettes":["彩","日菜","千聖","イヴ","麻弥"], "Roselia":["友希那","紗夜","リサ","燐子","あこ"], "RaiseASuilen":["レイヤ","ロック","ますき","チュチュ","パレオ"], "Morfonica":["ましろ","瑠唯","つくし","七深","透子"], "MyGo&AveMujica(Part)":["燈","愛音","そよ","立希","楽奈","祥子","睦","海鈴"], } if sys.platform == "darwin" and torch.backends.mps.is_available(): device = "mps" os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" else: device = "cuda" def is_japanese(string): for ch in string: if ord(ch) > 0x3040 and ord(ch) < 0x30FF: return True return False def extrac(text): text = re.sub("<[^>]*>","",text) result_list = re.split(r'\n', text) final_list = [] for i in result_list: i = i.replace('\n','').replace(' ','') #Current length of single sentence: 20 if len(i)>1: if len(i) > 20: try: cur_list = re.split(r'。|!', i) for i in cur_list: if len(i)>1: final_list.append(i+'。') except: pass else: final_list.append(i) ''' final_list.append(i) ''' final_list = [x for x in final_list if x != ''] return final_list def get_text(text, language_str, hps): norm_text, phone, tone, word2ph = clean_text(text, language_str) phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) if hps.data.add_blank: phone = commons.intersperse(phone, 0) tone = commons.intersperse(tone, 0) language = commons.intersperse(language, 0) for i in range(len(word2ph)): word2ph[i] = word2ph[i] * 2 word2ph[0] += 1 bert = get_bert(norm_text, word2ph, language_str, device) del word2ph assert bert.shape[-1] == len(phone), phone if language_str == "ZH": bert = bert ja_bert = torch.zeros(768, len(phone)) elif language_str == "JA": ja_bert = bert bert = torch.zeros(1024, len(phone)) else: bert = torch.zeros(1024, len(phone)) ja_bert = torch.zeros(768, len(phone)) assert bert.shape[-1] == len( phone ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" phone = torch.LongTensor(phone) tone = torch.LongTensor(tone) language = torch.LongTensor(language) return bert, ja_bert, phone, tone, language def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, language): global net_g bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps) with torch.no_grad(): x_tst = phones.to(device).unsqueeze(0) tones = tones.to(device).unsqueeze(0) lang_ids = lang_ids.to(device).unsqueeze(0) bert = bert.to(device).unsqueeze(0) ja_bert = ja_bert.to(device).unsqueeze(0) x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) del phones speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) audio = ( net_g.infer( x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, )[0][0, 0] .data.cpu() .float() .numpy() ) del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers return audio def tts_fn( text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,LongSentence ): if not LongSentence: with torch.no_grad(): audio = infer( text, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, sid=speaker, language= "JP" if is_japanese(text) else "ZH", ) torch.cuda.empty_cache() return (hps.data.sampling_rate, audio) else: audiopath = 'voice.wav' a = ['【','[','(','('] b = ['】',']',')',')'] for i in a: text = text.replace(i,'<') for i in b: text = text.replace(i,'>') final_list = extrac(text.replace('“','').replace('”','')) audio_fin = [] for sentence in final_list: with torch.no_grad(): audio = infer( sentence, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, sid=speaker, language= "JP" if is_japanese(text) else "ZH", ) audio_fin.append(audio) return (hps.data.sampling_rate, np.concatenate(audio_fin)) def split_into_sentences(text): """将文本分割为句子,基于中文的标点符号""" sentences = re.split(r'(?<=[。!?…\n])', text) return [sentence.strip() for sentence in sentences if sentence] def seconds_to_ass_time(seconds): """将秒数转换为ASS时间格式""" hours = int(seconds / 3600) minutes = int((seconds % 3600) / 60) seconds = int(seconds) % 60 milliseconds = int((seconds - int(seconds)) * 1000) return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, seconds, int(milliseconds / 10)) def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime): audio_fin = [] ass_entries = [] start_time = 0 ass_header = """[Script Info] ; Script generated by OpenAI Assistant Title: Audiobook ScriptType: v4.00+ WrapStyle: 0 PlayResX: 640 PlayResY: 360 ScaledBorderAndShadow: yes [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1 [Events] Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text """ for sentence in group: try: print(sentence) FakeSpeaker = sentence.split("|")[0] print(FakeSpeaker) SpeakersList = re.split('\n', spealerList) if FakeSpeaker in list(hps.data.spk2id.keys()): speaker = FakeSpeaker for i in SpeakersList: if FakeSpeaker == i.split("|")[1]: speaker = i.split("|")[0] speaker_ids = hps.data.spk2id _, audio = tts_fn(sentence.split("|")[-1], speaker=speaker, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, LongSentence=True) silence_frames = int(silenceTime * 44010) silence_data = np.zeros((silence_frames,), dtype=audio.dtype) audio_fin.append(audio) audio_fin.append(silence_data) duration = len(audio) / sampling_rate end_time = start_time + duration + silenceTime ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|",":"))) start_time = end_time except: pass wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav') ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass') write(wav_filename, sampling_rate, np.concatenate(audio_fin)) with open(ass_filename, 'w', encoding='utf-8') as f: f.write(ass_header + '\n'.join(ass_entries)) return (hps.data.sampling_rate, np.concatenate(audio_fin)) def extract_text_from_epub(file_path): book = epub.read_epub(file_path) content = [] for item in book.items: if isinstance(item, epub.EpubHtml): soup = BeautifulSoup(item.content, 'html.parser') content.append(soup.get_text()) return '\n'.join(content) def extract_text_from_pdf(file_path): with open(file_path, 'rb') as file: reader = PdfReader(file) content = [page.extract_text() for page in reader.pages] return '\n'.join(content) def extract_text_from_game2(data): current_content = [] def _extract(data, current_data=None): nonlocal current_content if current_data is None: current_data = {} if isinstance(data, dict): if 'name' in data and 'body' in data: current_name = data['name'] current_body = data['body'].replace('\n', '') current_content.append(f"{current_name}|{current_body}") for key, value in data.items(): _extract(value, dict(current_data)) elif isinstance(data, list): for item in data: _extract(item, dict(current_data)) _extract(data) return '\n'.join(current_content) def extract_text_from_file(inputFile): file_extension = os.path.splitext(inputFile)[1].lower() if file_extension == ".epub": return extract_text_from_epub(inputFile) elif file_extension == ".pdf": return extract_text_from_pdf(inputFile) elif file_extension == ".txt": with open(inputFile, 'r', encoding='utf-8') as f: return f.read() elif file_extension == ".asset": with open(inputFile, 'r', encoding='utf-8') as f: content = json.load(f) return extract_text_from_game2(content) if extract_text_from_game(content) != '' else extract_text_from_game(content) else: raise ValueError(f"Unsupported file format: {file_extension}") def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime): directory_path = "books" output_path = "books/audiobook_part_1.wav" if os.path.exists(directory_path): shutil.rmtree(directory_path) os.makedirs(directory_path) text = extract_text_from_file(inputFile.name) sentences = split_into_sentences(text) GROUP_SIZE = groupsize for i in range(0, len(sentences), GROUP_SIZE): group = sentences[i:i+GROUP_SIZE] if spealerList == "": spealerList = "无" result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime) if not torch.cuda.is_available(): return result return result if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-m", "--model", default="./logs/BangDream/G_43000.pth", help="path of your model" ) parser.add_argument( "-c", "--config", default="./logs/BangDream/config.json", help="path of your config file", ) parser.add_argument( "--share", default=True, help="make link public", action="store_true" ) parser.add_argument( "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log" ) args = parser.parse_args() if args.debug: logger.info("Enable DEBUG-LEVEL log") logging.basicConfig(level=logging.DEBUG) hps = utils.get_hparams_from_file(args.config) device = ( "cuda:0" if torch.cuda.is_available() else ( "mps" if sys.platform == "darwin" and torch.backends.mps.is_available() else "cpu" ) ) net_g = SynthesizerTrn( len(symbols), hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, **hps.model, ).to(device) _ = net_g.eval() _ = utils.load_checkpoint(args.model, net_g, None, skip_optimizer=True) speaker_ids = hps.data.spk2id speakers = list(speaker_ids.keys()) languages = ["ZH", "JP"] examples = [ ["filelist/Scenarioband6-018.asset", 500, "つくし", "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子", "扩展功能"], ] with gr.Blocks() as app: gr.Markdown( '# Bang Dream全员TTS,使用本模型请严格遵守法律法规!\n发布二创作品请标注本项目作者及链接、作品使用Bert-VITS2 AI生成!\n 使用以及本地部署指南 ' ) for band in BandList: with gr.TabItem(band): for name in BandList[band]: with gr.TabItem(name): with gr.Row(): with gr.Column(): with gr.Row(): gr.Markdown( '