Spaces:

Mahiruoshi
/

BangDream-Bert-VITS2

Running

App Files Files Community

BangDream-Bert-VITS2 / app.py

Mahiruoshi

Update app.py

0a949ac about 1 year ago

raw

history blame

20.5 kB

	# flake8: noqa: E402
	import logging
	# Quieten chatty third-party loggers: only warnings and above get through.
	for _noisy_name in ("numba", "markdown_it", "urllib3", "matplotlib"):
	    logging.getLogger(_noisy_name).setLevel(logging.WARNING)

	# Root logger at INFO with pipe-delimited records. The pipes are written
	# plainly: "\|" is not a valid string escape and would leak a backslash
	# into every log line.
	logging.basicConfig(
	    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
	)

	# Module-level logger used by the rest of this file.
	logger = logging.getLogger(__name__)
	import datetime
	import numpy as np
	import torch
	from ebooklib import epub
	import PyPDF2
	from PyPDF2 import PdfReader
	import zipfile
	import shutil
	import sys, os
	import json
	from bs4 import BeautifulSoup
	import argparse
	import commons
	import utils
	from models import SynthesizerTrn
	from text.symbols import symbols
	from text import cleaned_text_to_sequence, get_bert
	from text.cleaner import clean_text
	import gradio as gr
	import webbrowser
	import re
	from scipy.io.wavfile import write
	# Synthesizer model; None until the __main__ section builds and assigns it.
	net_g = None

	# Band name -> names of its members. The UI creates one tab per band and
	# one nested tab per member from this table.
	BandList = {
	    "PoppinParty": ["香澄", "有咲", "たえ", "りみ", "沙綾"],
	    "Afterglow": ["蘭", "モカ", "ひまり", "巴", "つぐみ"],
	    "HelloHappyWorld": ["こころ", "ミッシェル", "薫", "花音", "はぐみ"],
	    "PastelPalettes": ["彩", "日菜", "千聖", "イヴ", "麻弥"],
	    "Roselia": ["友希那", "紗夜", "リサ", "燐子", "あこ"],
	    "RaiseASuilen": ["レイヤ", "ロック", "ますき", "チュチュ", "パレオ"],
	    "Morfonica": ["ましろ", "瑠唯", "つくし", "七深", "透子"],
	    "MyGo&AveMujica(Part)": [
	        "燈", "愛音", "そよ", "立希", "楽奈", "祥子", "睦", "海鈴",
	    ],
	}

	# Default inference device: Apple MPS on macOS when available, otherwise
	# CUDA when a GPU is present, otherwise the CPU. Falling back to "cpu"
	# (rather than unconditionally to "cuda") keeps this usable on machines
	# without a GPU and agrees with the selection made in the __main__ section.
	if sys.platform == "darwin" and torch.backends.mps.is_available():
	    device = "mps"
	    # Ask PyTorch to run operators that MPS lacks on the CPU instead of failing.
	    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
	elif torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	def is_japanese(string):
	    """Return True if *string* contains at least one kana character.

	    A character counts when its code point lies strictly between U+3040 and
	    U+30FF. Kanji alone do not make a string Japanese for this check.
	    """
	    return any(0x3040 < ord(symbol) < 0x30FF for symbol in string)

	def extrac(text):
	    """Strip bracketed markup from *text* and cut it into short sentences.

	    Everything between ``<`` and ``>`` is removed, the remainder is split on
	    newlines, and spaces are dropped. Lines longer than 20 characters are
	    split further at the full-width sentence enders ``。`` and ``！``; every
	    resulting piece is re-terminated with ``。``. Fragments of one character
	    or fewer are discarded.

	    Returns:
	        list[str]: the sentence fragments in their original order.
	    """
	    max_sentence_length = 20
	    text = re.sub("<[^>]*>", "", text)
	    sentences = []
	    for line in text.split("\n"):
	        line = line.replace(" ", "")
	        if len(line) <= 1:
	            continue
	        if len(line) <= max_sentence_length:
	            sentences.append(line)
	            continue
	        # A character class is used on purpose: an escaped bar ("\|") in the
	        # pattern matches a literal pipe, not "either ender", which would
	        # leave long lines unsplit.
	        sentences.extend(
	            piece + "。" for piece in re.split("[。！]", line) if len(piece) > 1
	        )
	    return sentences

	def get_text(text, language_str, hps):
	    """Turn raw text into the tensors that are fed to the synthesizer.

	    The text is cleaned with ``clean_text`` and converted to id sequences with
	    ``cleaned_text_to_sequence``; BERT features come from ``get_bert``.

	    Returns:
	        tuple: ``(bert, ja_bert, phone, tone, language)`` where the last three
	        are ``torch.LongTensor`` and the feature matrices have one column per
	        phone. For a language that is not the one the matrix belongs to, the
	        matrix is all zeros (1024 rows for ``bert``, 768 for ``ja_bert``).

	    NOTE(review): the Japanese branch tests for "JA", while tts_fn in this
	    file passes "JP"; with "JP" both matrices end up zero-filled. Confirm
	    which code the model was trained with before changing either side.
	    """
	    norm_text, phone, tone, word2ph = clean_text(text, language_str)
	    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

	    if hps.data.add_blank:
	        # Run each sequence through commons.intersperse with filler 0, then
	        # double the per-word counts (+1 on the first word) -- presumably so
	        # that word2ph still sums to the new sequence length; TODO confirm.
	        phone, tone, language = (
	            commons.intersperse(sequence, 0) for sequence in (phone, tone, language)
	        )
	        for index, count in enumerate(word2ph):
	            word2ph[index] = count * 2
	        word2ph[0] += 1

	    bert = get_bert(norm_text, word2ph, language_str, device)
	    del word2ph
	    n_phones = len(phone)
	    assert bert.shape[-1] == n_phones, phone

	    if language_str == "ZH":
	        ja_bert = torch.zeros(768, n_phones)
	    elif language_str == "JA":
	        ja_bert, bert = bert, torch.zeros(1024, n_phones)
	    else:
	        bert = torch.zeros(1024, n_phones)
	        ja_bert = torch.zeros(768, n_phones)

	    assert (
	        bert.shape[-1] == n_phones
	    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

	    return (
	        bert,
	        ja_bert,
	        torch.LongTensor(phone),
	        torch.LongTensor(tone),
	        torch.LongTensor(language),
	    )


	def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, language):
	    """Synthesize *text* with the global model and return the waveform.

	    Args:
	        text: text to speak.
	        sdp_ratio, noise_scale, noise_scale_w, length_scale: forwarded
	            unchanged to ``net_g.infer``.
	        sid: speaker name; must be a key of ``hps.data.spk2id``.
	        language: language code handed to ``get_text``.

	    Returns:
	        numpy array taken from element ``[0][0, 0]`` of the model output,
	        moved to the CPU and converted to float.
	    """
	    global net_g
	    bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps)
	    with torch.no_grad():
	        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
	        # Move every input to the device and add a leading batch dimension.
	        x_tst, tones, lang_ids, bert, ja_bert = (
	            tensor.to(device).unsqueeze(0)
	            for tensor in (phones, tones, lang_ids, bert, ja_bert)
	        )
	        del phones
	        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
	        audio = (
	            net_g.infer(
	                x_tst,
	                x_tst_lengths,
	                speakers,
	                tones,
	                lang_ids,
	                bert,
	                ja_bert,
	                sdp_ratio=sdp_ratio,
	                noise_scale=noise_scale,
	                noise_scale_w=noise_scale_w,
	                length_scale=length_scale,
	            )[0][0, 0]
	            .data.cpu()
	            .float()
	            .numpy()
	        )
	        # Drop the device tensors before returning.
	        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
	        return audio


	def tts_fn(
	    text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, LongSentence
	):
	    """Gradio callback: synthesize *text* in the voice of *speaker*.

	    With ``LongSentence`` false the text is synthesized in a single pass.
	    Otherwise bracketed spans (``【】 [] () （）``) are turned into ``<...>`` so
	    that ``extrac`` removes them, curly quotes are dropped, and the remaining
	    sentences are synthesized one by one and concatenated.

	    The language is "JP" when the whole text contains kana, else "ZH".

	    Returns:
	        tuple: ``(hps.data.sampling_rate, samples)``.
	    """
	    synth_options = dict(
	        sdp_ratio=sdp_ratio,
	        noise_scale=noise_scale,
	        noise_scale_w=noise_scale_w,
	        length_scale=length_scale,
	        sid=speaker,
	    )

	    if not LongSentence:
	        with torch.no_grad():
	            audio = infer(
	                text,
	                language="JP" if is_japanese(text) else "ZH",
	                **synth_options,
	            )
	        torch.cuda.empty_cache()
	        return (hps.data.sampling_rate, audio)

	    # Map every opening bracket to "<" and every closing bracket to ">".
	    text = text.translate(str.maketrans('【[(（】])）', '<<<<>>>>'))
	    # Detected once on the whole text, so kana-free sentences inside a
	    # Japanese text are still read as Japanese.
	    language = "JP" if is_japanese(text) else "ZH"
	    sentences = extrac(text.replace('“', '').replace('”', ''))

	    pieces = []
	    for sentence in sentences:
	        with torch.no_grad():
	            pieces.append(infer(sentence, language=language, **synth_options))
	    return (hps.data.sampling_rate, np.concatenate(pieces))

	def split_into_sentences(text):
	    """Split *text* into sentences at Chinese/Japanese end punctuation.

	    The cut is made right after each of ``。！？…`` or a newline, so the
	    punctuation stays attached to its sentence. Surrounding whitespace is
	    stripped, and fragments that are empty *after* stripping are dropped
	    (a bare newline following ``。`` must not become an empty sentence).

	    Returns:
	        list[str]: the non-empty, stripped sentences in order.
	    """
	    fragments = re.split(r'(?<=[。！？…\n])', text)
	    stripped_fragments = (fragment.strip() for fragment in fragments)
	    return [sentence for sentence in stripped_fragments if sentence]


	def seconds_to_ass_time(seconds):
	    """Format a non-negative duration in seconds as an ASS timestamp.

	    Returns:
	        str: ``H:MM:SS.cc`` where ``cc`` is centiseconds (truncated).
	    """
	    whole_seconds = int(seconds)
	    # Take the fractional part from the untouched input; reducing the value
	    # to an integer first would always yield ".00".
	    milliseconds = int((seconds - whole_seconds) * 1000)
	    hours, remainder = divmod(whole_seconds, 3600)
	    minutes, secs = divmod(remainder, 60)
	    return "{:01d}:{:02d}:{:02d}.{:02d}".format(
	        hours, minutes, secs, milliseconds // 10
	    )

	def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
	    """Synthesize a group of sentences into one WAV file plus an ASS subtitle file.

	    Each sentence is split on the speaker separator (a backslash followed by
	    a pipe, the same runtime value the rest of this file produces). The first
	    field is treated as a speaker label and the last field as the text to
	    speak. The label selects the voice when it is a key of
	    ``hps.data.spk2id`` or when ``spealerList`` maps it to a speaker;
	    otherwise the most recently used speaker is kept. Sentences that fail to
	    synthesize are logged and skipped.

	    Args:
	        group: sentences to synthesize, in order.
	        outputPath: directory that receives ``audiobook_part_<group_index>``
	            ``.wav`` and ``.ass``.
	        group_index: number used in the output file names.
	        sampling_rate: rate used for the WAV file, the silence length and the
	            subtitle timing.
	        speaker: initial speaker name.
	        sdp_ratio, noise_scale, noise_scale_w, length_scale: forwarded to
	            ``tts_fn``.
	        spealerList: newline-separated mapping lines of the form
	            ``<model speaker><separator><label used in the text>``. Lines
	            without a separator are ignored.
	        silenceTime: seconds of silence appended after every sentence.

	    Returns:
	        tuple: ``(hps.data.sampling_rate, samples)`` with all audio joined.

	    Raises:
	        ValueError: if no sentence of the group could be synthesized.
	    """
	    log = logging.getLogger(__name__)
	    separator = "\\|"

	    ass_header = "\n".join([
	        "[Script Info]",
	        "; Script generated by OpenAI Assistant",
	        "Title: Audiobook",
	        "ScriptType: v4.00+",
	        "WrapStyle: 0",
	        "PlayResX: 640",
	        "PlayResY: 360",
	        "ScaledBorderAndShadow: yes",
	        "",
	        "[V4+ Styles]",
	        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
	        "Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1",
	        "",
	        "[Events]",
	        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
	        "",
	    ])

	    # Parse the mapping once: label found in the text -> model speaker.
	    # Lines without a separator (blank lines, or a placeholder such as "无")
	    # carry no mapping and are skipped instead of aborting every sentence.
	    speaker_by_label = {}
	    for mapping_line in spealerList.split("\n"):
	        fields = mapping_line.split(separator)
	        if len(fields) >= 2:
	            speaker_by_label[fields[1]] = fields[0]

	    audio_fin = []
	    ass_entries = []
	    start_time = 0

	    for sentence in group:
	        try:
	            print(sentence)
	            parts = sentence.split(separator)
	            label = parts[0]
	            print(label)
	            if label in hps.data.spk2id:
	                speaker = label
	            # An explicit mapping wins; without one the current speaker stays.
	            speaker = speaker_by_label.get(label, speaker)

	            _, audio = tts_fn(parts[-1], speaker=speaker, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, LongSentence=True)
	        except Exception:
	            # Best effort per sentence: record the failure and carry on.
	            log.exception("Skipping sentence that could not be synthesized: %r", sentence)
	            continue

	        # The pause must be measured at the same rate as the subtitle timing
	        # below, otherwise audio and subtitles drift apart.
	        silence_data = np.zeros((int(silenceTime * sampling_rate),), dtype=audio.dtype)
	        audio_fin.append(audio)
	        audio_fin.append(silence_data)

	        end_time = start_time + len(audio) / sampling_rate + silenceTime
	        ass_entries.append(
	            "Dialogue: 0,{},{},Default,,0,0,0,,{}".format(
	                seconds_to_ass_time(start_time),
	                seconds_to_ass_time(end_time),
	                sentence.replace(separator, "："),
	            )
	        )
	        start_time = end_time

	    if not audio_fin:
	        raise ValueError(
	            f"No sentence in group {group_index} could be synthesized"
	        )

	    wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
	    ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')

	    combined = np.concatenate(audio_fin)
	    write(wav_filename, sampling_rate, combined)

	    with open(ass_filename, 'w', encoding='utf-8') as f:
	        f.write(ass_header + '\n'.join(ass_entries))
	    return (hps.data.sampling_rate, combined)
	def extract_text_from_epub(file_path):
	    """Return the text of every HTML item of an EPUB, joined by newlines.

	    Items of the book that are not ``epub.EpubHtml`` instances are ignored;
	    markup is removed with BeautifulSoup's ``get_text``.
	    """
	    book = epub.read_epub(file_path)
	    chapter_texts = [
	        BeautifulSoup(item.content, 'html.parser').get_text()
	        for item in book.items
	        if isinstance(item, epub.EpubHtml)
	    ]
	    return '\n'.join(chapter_texts)

	def extract_text_from_pdf(file_path):
	    """Return the extracted text of every page of a PDF, joined by newlines."""
	    with open(file_path, 'rb') as pdf_file:
	        page_texts = []
	        for page in PdfReader(pdf_file).pages:
	            page_texts.append(page.extract_text())
	        return '\n'.join(page_texts)

	def extract_text_from_game(data):
	    """Collect spoken lines from a parsed game-script JSON structure.

	    The structure is walked depth first. ``windowDisplayName``, ``body``
	    (with newlines removed) and ``voiceId`` are remembered as they are met and
	    inherited by nested values. Whenever a dict is visited with all three
	    present and non-empty, the display name does not contain ``・`` and the
	    body holds at least one word character, one line is emitted.

	    Returns:
	        str: the emitted ``<display name><separator><body>`` lines joined by
	        newlines, where the separator is a backslash followed by a pipe.
	    """
	    # Written with an explicit double backslash: same runtime value as the
	    # "\|" used elsewhere in this file, without the invalid-escape warning.
	    separator = "\\|"
	    required_keys = ('windowDisplayName', 'body', 'voiceId')
	    collected = []

	    def _extract(node, inherited=None):
	        if inherited is None:
	            inherited = {}

	        if isinstance(node, dict):
	            if 'windowDisplayName' in node:
	                inherited['windowDisplayName'] = node['windowDisplayName']
	            if 'body' in node:
	                inherited['body'] = node['body'].replace('\n', '')
	            if 'voiceId' in node:
	                inherited['voiceId'] = node['voiceId']

	            has_all_fields = all(inherited.get(key) for key in required_keys)
	            name_is_single = "・" not in inherited.get('windowDisplayName', "")
	            body_has_words = bool(re.sub(r'[^\w]', '', inherited.get('body', "")))

	            if has_all_fields and name_is_single and body_has_words:
	                collected.append(
	                    f"{inherited['windowDisplayName']}{separator}{inherited['body']}"
	                )

	            # Each child gets its own copy so siblings do not see each
	            # other's fields.
	            for child in node.values():
	                _extract(child, dict(inherited))

	        elif isinstance(node, list):
	            for child in node:
	                _extract(child, dict(inherited))

	    _extract(data)
	    return '\n'.join(collected)

	def extract_text_from_file(inputFile):
	    """Return the text of *inputFile*, chosen by its extension (case-insensitive).

	    ``.epub`` and ``.pdf`` go to their extractors, ``.txt`` is read as UTF-8,
	    and ``.asset`` is parsed as UTF-8 JSON and passed to
	    ``extract_text_from_game``.

	    Raises:
	        ValueError: for any other extension.
	    """
	    _, suffix = os.path.splitext(inputFile)
	    suffix = suffix.lower()

	    if suffix == ".epub":
	        return extract_text_from_epub(inputFile)
	    if suffix == ".pdf":
	        return extract_text_from_pdf(inputFile)
	    if suffix == ".txt":
	        with open(inputFile, 'r', encoding='utf-8') as handle:
	            return handle.read()
	    if suffix == ".asset":
	        with open(inputFile, 'r', encoding='utf-8') as handle:
	            parsed = json.load(handle)
	        return extract_text_from_game(parsed)
	    raise ValueError(f"Unsupported file format: {suffix}")

	def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
	    """Gradio callback: turn an uploaded file into audiobook parts under ``books/``.

	    The ``books`` directory is recreated from scratch, the file's text is
	    split into sentences, and the sentences are synthesized in groups of
	    ``groupsize`` via ``generate_audio_and_srt_for_group`` (part numbers
	    start at 1). Without CUDA only the first part is produced.

	    Args:
	        inputFile: uploaded file object; only its ``name`` (a path) is used.
	        groupsize: number of sentences per output part.
	        speaker: initial speaker name.
	        sdp_ratio, noise_scale, noise_scale_w, length_scale: synthesis
	            settings forwarded unchanged.
	        spealerList: speaker mapping text; an empty string is replaced by "无".
	        silenceTime: seconds of silence after each sentence.

	    Returns:
	        The result of the last group that was synthesized.

	    Raises:
	        ValueError: if no sentence could be extracted from the file.
	    """
	    directory_path = "books"

	    if os.path.exists(directory_path):
	        shutil.rmtree(directory_path)
	    os.makedirs(directory_path)

	    sentences = split_into_sentences(extract_text_from_file(inputFile.name))
	    if not sentences:
	        # Fail with a clear message instead of an unbound-variable error.
	        raise ValueError("No sentences could be extracted from the uploaded file")

	    # Sliders may hand over a float; range() and slicing need an int.
	    group_size = int(groupsize)
	    if spealerList == "":
	        spealerList = "无"

	    result = None
	    group_starts = range(0, len(sentences), group_size)
	    for part_number, start in enumerate(group_starts, start=1):
	        group = sentences[start:start + group_size]
	        result = generate_audio_and_srt_for_group(group, directory_path, part_number, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime)
	        if not torch.cuda.is_available():
	            # Without CUDA, stop after the first part.
	            break
	    return result

	if __name__ == "__main__":
	    # Command line: model checkpoint, config file, and logging verbosity.
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "-m", "--model", default="./logs/BangDream/G_45000.pth", help="path of your model"
	    )
	    parser.add_argument(
	        "-c",
	        "--config",
	        default="./logs/BangDream/config.json",
	        help="path of your config file",
	    )
	    # NOTE(review): with default=True and action="store_true" this flag can
	    # never be False; it is kept as-is for command-line compatibility.
	    parser.add_argument(
	        "--share", default=True, help="make link public", action="store_true"
	    )
	    parser.add_argument(
	        "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
	    )

	    args = parser.parse_args()
	    if args.debug:
	        logger.info("Enable DEBUG-LEVEL log")
	        # logging.basicConfig() does nothing once the root logger has
	        # handlers (it was already configured at import time), so the level
	        # is raised on the root logger directly.
	        logging.getLogger().setLevel(logging.DEBUG)
	    hps = utils.get_hparams_from_file(args.config)

	    # Final device choice: CUDA first, then Apple MPS, then CPU.
	    if torch.cuda.is_available():
	        device = "cuda:0"
	    elif sys.platform == "darwin" and torch.backends.mps.is_available():
	        device = "mps"
	    else:
	        device = "cpu"

	    # Build the synthesizer from the config, switch it to eval mode and load
	    # the checkpoint weights (no optimizer state).
	    net_g = SynthesizerTrn(
	        len(symbols),
	        hps.data.filter_length // 2 + 1,
	        hps.train.segment_size // hps.data.hop_length,
	        n_speakers=hps.data.n_speakers,
	        **hps.model,
	    ).to(device)
	    _ = net_g.eval()

	    _ = utils.load_checkpoint(args.model, net_g, None, skip_optimizer=True)

	    speaker_ids = hps.data.spk2id
	    speakers = list(speaker_ids.keys())
	    languages = ["ZH", "JP"]
	    # Each example: [sample file, group size, default speaker,
	    # speaker mapping text, tab title]. The mapping uses the same
	    # backslash + pipe separator as the rest of this file.
	    examples = [
	        ["filelist/Scenarioband6-018.asset", 500, "つくし", "ましろ\\|真白\n七深\\|七深\n透子\\|透子\nつくし\\|筑紫\n瑠唯\\|瑠唯\nそよ\\|素世\n祥子\\|祥子", "扩展功能"],
	    ]
	# Build the Gradio interface.
	# NOTE(review): the indentation of this section is missing in this copy, so
	# the nesting of the layout containers (Row/Column/Accordion) cannot be read
	# from the source -- confirm it against the original file before editing.
	with gr.Blocks() as app:
	gr.Markdown(
	'# Bang Dream全员TTS,使用本模型请严格遵守法律法规！\n发布二创作品请标注本项目作者及链接、作品使用Bert-VITS2 AI生成！'
	)
	# One tab per band in BandList, and one nested tab per member name.
	for band in BandList:
	with gr.TabItem(band):
	for name in BandList[band]:
	with gr.TabItem(name):
	with gr.Row():
	with gr.Column():
	with gr.Row():
	# Character portrait, looked up by member name under file/image/.
	gr.Markdown(
	'<div align="center">'
	f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
	'</div>'
	)
	# Checked by default: tts_fn then splits the text into sentences first.
	LongSentence = gr.Checkbox(value=True, label="Generate LongSentence")
	with gr.Column():
	text = gr.TextArea(
	label="输入纯日语或者中文",
	placeholder="输入纯日语或者中文",
	value="純粋な日本語または中国語を入力してください。",
	)
	btn = gr.Button("点击生成", variant="primary")
	audio_output = gr.Audio(label="Output Audio")
	# Synthesis settings; the accordion starts collapsed.
	with gr.Accordion(label="TTS设定", open=False):
	sdp_ratio = gr.Slider(
	minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
	)
	noise_scale = gr.Slider(
	minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
	)
	noise_scale_w = gr.Slider(
	minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
	)
	length_scale = gr.Slider(
	minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
	)
	# Speaker defaults to the member this tab belongs to.
	speaker = gr.Dropdown(
	choices=speakers, value=name, label="说话人"
	)
	# The input order matches the positional parameters of tts_fn.
	btn.click(
	tts_fn,
	inputs=[
	text,
	speaker,
	sdp_ratio,
	noise_scale,
	noise_scale_w,
	length_scale,
	LongSentence,
	],
	outputs=[audio_output],
	)
	# One extra tab per entry of `examples` (titled by the entry's last element)
	# hosting the audiobook generator.
	for i in examples:
	with gr.Tab(i[-1]):
	with gr.Row():
	with gr.Column():
	gr.Markdown(
	f"从 <a href='filelists'>filelists文件夹</a> 下载示例\n游戏脚本见<a href='https://bestdori.com/tool/explorer/asset/cn/scenario'>bestdori</a>"
	)
	# NOTE(review): `gr.inputs` looks like Gradio's legacy component
	# namespace; newer releases may expect `gr.File` -- confirm against the
	# Gradio version this app is pinned to.
	inputFile = gr.inputs.File(label="上传游戏脚本(日文)、中文脚本(需设置角色对应关系)、自制文、(需设置角色对应关系")
	# Passed to audiobook() as the number of sentences per output part.
	groupSize = gr.Slider(
	minimum=10, maximum=1000,value = i[1], step=1, label="当个音频文件包含的最大字数"
	)
	silenceTime = gr.Slider(
	minimum=0, maximum=1, value=0.5, step=0.1, label="句子的间隔"
	)
	# Speaker mapping text, pre-filled from the example entry.
	spealerList = gr.TextArea(
	label="角色对应表",
	placeholder="左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList1}\|{SeakerInUploadText1}\n{ChoseSpeakerFromConfigList2}\|{SeakerInUploadText2}\n{ChoseSpeakerFromConfigList3}\|{SeakerInUploadText3}\n",
	value = i[3],
	)
	speaker = gr.Dropdown(
	choices=speakers, value = i[2], label="角色清单"
	)
	with gr.Column():
	sdp_ratio = gr.Slider(
	minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
	)
	noise_scale = gr.Slider(
	minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
	)
	noise_scale_w = gr.Slider(
	minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
	)
	length_scale = gr.Slider(
	minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
	)
	LastAudioOutput = gr.Audio(label="当用cuda在本地运行时才能在book文件夹下浏览全部合成内容")
	btn2 = gr.Button("点击生成", variant="primary")
	# The input order matches the positional parameters of audiobook().
	btn2.click(
	audiobook,
	inputs=[
	inputFile,
	groupSize,
	speaker,
	sdp_ratio,
	noise_scale,
	noise_scale_w,
	length_scale,
	spealerList,
	silenceTime
	],
	outputs=[LastAudioOutput],
	)
	# NOTE(review): the parsed --share option is not passed to launch().
	app.launch()