|
import argparse |
|
import datetime |
|
import os |
|
import sys |
|
import warnings |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
from gradio.processing_utils import convert_to_16_bit_wav |
|
|
|
import utils |
|
from config import config |
|
from infer import get_net_g, infer |
|
from tools.log import logger |
|
|
|
is_hf_spaces = os.getenv("SYSTEM") == "spaces" |
|
limit = 150 |
|
|
|
|
|
class Model: |
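    """Holds a single Style-Bert-VITS2 model along with its hyperparameters and style vectors."""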
|
def __init__(self, model_path, config_path, style_vec_path, device): |
|
self.model_path = model_path |
|
self.config_path = config_path |
|
self.device = device |
|
self.style_vec_path = style_vec_path |
|
self.load() |
|
|
|
def load(self): |
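        """Load hyperparameters, speaker/style mappings, and style vectors; defer loading the network itself."""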
|
self.hps = utils.get_hparams_from_file(self.config_path) |
|
self.spk2id = self.hps.data.spk2id |
|
self.num_styles = self.hps.data.num_styles |
|
if hasattr(self.hps.data, "style2id"): |
|
self.style2id = self.hps.data.style2id |
|
else: |
|
self.style2id = {str(i): i for i in range(self.num_styles)} |
|
|
|
self.style_vectors = np.load(self.style_vec_path) |
|
self.net_g = None |
|
|
|
def load_net_g(self): |
|
self.net_g = get_net_g( |
|
model_path=self.model_path, |
|
version=self.hps.version, |
|
device=self.device, |
|
hps=self.hps, |
|
) |
|
|
|
def get_style_vector(self, style_id, weight=1.0): |
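        """Blend the mean style vector (index 0) toward the chosen style by the given weight."""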
|
mean = self.style_vectors[0] |
|
style_vec = self.style_vectors[style_id] |
|
style_vec = mean + (style_vec - mean) * weight |
|
return style_vec |
|
|
|
def get_style_vector_from_audio(self, audio_path, weight=1.0): |
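        """Extract a style vector from a reference audio file and blend it with the mean style."""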
|
from style_gen import extract_style_vector |
|
|
|
xvec = extract_style_vector(audio_path) |
|
mean = self.style_vectors[0] |
|
xvec = mean + (xvec - mean) * weight |
|
return xvec |
|
|
|
def infer( |
|
self, |
|
text, |
|
language="JP", |
|
sid=0, |
|
reference_audio_path=None, |
|
sdp_ratio=0.2, |
|
noise=0.6, |
|
noisew=0.8, |
|
length=1.0, |
|
line_split=True, |
|
split_interval=0.2, |
|
style_text="", |
|
style_weight=0.7, |
|
use_style_text=False, |
|
style="0", |
|
emotion_weight=1.0, |
|
): |
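        """Synthesize speech for `text`, optionally splitting on line breaks and inserting silence between lines."""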
|
if reference_audio_path == "": |
|
reference_audio_path = None |
|
if style_text == "" or not use_style_text: |
|
style_text = None |
|
|
|
if self.net_g is None: |
|
self.load_net_g() |
|
if reference_audio_path is None: |
|
style_id = self.style2id[style] |
|
style_vector = self.get_style_vector(style_id, emotion_weight) |
|
else: |
|
style_vector = self.get_style_vector_from_audio( |
|
reference_audio_path, emotion_weight |
|
) |
|
if not line_split: |
|
with torch.no_grad(): |
|
audio = infer( |
|
text=text, |
|
sdp_ratio=sdp_ratio, |
|
noise_scale=noise, |
|
noise_scale_w=noisew, |
|
length_scale=length, |
|
sid=sid, |
|
language=language, |
|
hps=self.hps, |
|
net_g=self.net_g, |
|
device=self.device, |
|
style_text=style_text, |
|
style_weight=style_weight, |
|
style_vec=style_vector, |
|
) |
|
else: |
|
texts = text.split("\n") |
|
texts = [t for t in texts if t != ""] |
|
audios = [] |
|
with torch.no_grad(): |
|
for i, t in enumerate(texts): |
|
audios.append( |
|
infer( |
|
text=t, |
|
sdp_ratio=sdp_ratio, |
|
noise_scale=noise, |
|
noise_scale_w=noisew, |
|
length_scale=length, |
|
sid=sid, |
|
language=language, |
|
hps=self.hps, |
|
net_g=self.net_g, |
|
device=self.device, |
|
style_text=style_text, |
|
style_weight=style_weight, |
|
style_vec=style_vector, |
|
) |
|
) |
|
if i != len(texts) - 1: |
|
                        audios.append(np.zeros(int(self.hps.data.sampling_rate * split_interval)))
|
audio = np.concatenate(audios) |
|
with warnings.catch_warnings(): |
|
warnings.simplefilter("ignore") |
|
audio = convert_to_16_bit_wav(audio) |
|
return (self.hps.data.sampling_rate, audio) |
|
|
|
|
|
class ModelHolder: |
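    """Scans a root directory for model folders and manages loading the currently selected model."""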
|
def __init__(self, root_dir, device): |
|
self.root_dir = root_dir |
|
self.device = device |
|
self.model_files_dict = {} |
|
self.current_model = None |
|
self.model_names = [] |
|
self.models = [] |
|
self.refresh() |
|
|
|
def refresh(self): |
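        """Rescan the root directory for subdirectories containing .pth/.pt/.safetensors model files."""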
|
self.model_files_dict = {} |
|
self.model_names = [] |
|
self.current_model = None |
|
model_dirs = [ |
|
d |
|
for d in os.listdir(self.root_dir) |
|
if os.path.isdir(os.path.join(self.root_dir, d)) |
|
] |
|
for model_name in model_dirs: |
|
model_dir = os.path.join(self.root_dir, model_name) |
|
model_files = [ |
|
os.path.join(model_dir, f) |
|
for f in os.listdir(model_dir) |
|
if f.endswith(".pth") or f.endswith(".pt") or f.endswith(".safetensors") |
|
] |
|
            if len(model_files) == 0:

                logger.info(

                    f"No model files found in {self.root_dir}/{model_name}, skipping it"

                )

                continue
|
self.model_files_dict[model_name] = model_files |
|
self.model_names.append(model_name) |
|
|
|
def load_model(self, model_name, model_path): |
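        """Load the selected model file and return updated style, synthesize-button, and speaker components."""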
|
if model_name not in self.model_files_dict: |
|
            raise Exception(f"Model name {model_name} does not exist")
|
if model_path not in self.model_files_dict[model_name]: |
|
            raise Exception(f"Model file {model_path} does not exist")
|
self.current_model = Model( |
|
model_path=model_path, |
|
config_path=os.path.join(self.root_dir, model_name, "config.json"), |
|
style_vec_path=os.path.join(self.root_dir, model_name, "style_vectors.npy"), |
|
device=self.device, |
|
) |
|
styles = list(self.current_model.style2id.keys()) |
|
speakers = list(self.current_model.spk2id.keys()) |
|
return ( |
|
gr.Dropdown(choices=styles, value=styles[0]), |
|
gr.update(interactive=True, value="Synthesize"), |
|
gr.Dropdown(choices=speakers, value=speakers[0]), |
|
) |
|
|
|
def update_model_files_dropdown(self, model_name): |
|
model_files = self.model_files_dict[model_name] |
|
return gr.Dropdown(choices=model_files, value=model_files[0]) |
|
|
|
def update_model_names_dropdown(self): |
|
self.refresh() |
|
initial_model_name = self.model_names[0] |
|
initial_model_files = self.model_files_dict[initial_model_name] |
|
return ( |
|
gr.Dropdown(choices=self.model_names, value=initial_model_name), |
|
gr.Dropdown(choices=initial_model_files, value=initial_model_files[0]), |
|
gr.update(interactive=False), |
|
) |
|
|
|
|
|
def tts_fn( |
|
model_name, |
|
model_path, |
|
text, |
|
language, |
|
reference_audio_path, |
|
sdp_ratio, |
|
noise_scale, |
|
noise_scale_w, |
|
length_scale, |
|
line_split, |
|
split_interval, |
|
style_text, |
|
style_weight, |
|
use_style_text, |
|
emotion, |
|
emotion_weight, |
|
speaker, |
|
): |
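    """Gradio callback: validate the input text, (re)load the model if needed, and run inference."""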
|
if not text: |
|
return "Please enter some text.", (44100, None) |
|
|
|
|
|
|
|
|
|
|
|
|
|
if is_hf_spaces and len(text) > limit: |
|
return f"Too long! There is a character limit of {limit} characters.", (44100, None) |
|
|
|
assert model_holder.current_model is not None |
|
|
|
    if model_holder.current_model.model_path != model_path:
|
model_holder.load_model(model_name, model_path) |
|
|
|
speaker_id = model_holder.current_model.spk2id[speaker] |
|
|
|
start_time = datetime.datetime.now() |
|
|
|
sr, audio = model_holder.current_model.infer( |
|
text=text, |
|
language=language, |
|
sid=speaker_id, |
|
reference_audio_path=reference_audio_path, |
|
sdp_ratio=sdp_ratio, |
|
noise=noise_scale, |
|
noisew=noise_scale_w, |
|
length=length_scale, |
|
line_split=line_split, |
|
split_interval=split_interval, |
|
style_text=style_text, |
|
style_weight=style_weight, |
|
use_style_text=use_style_text, |
|
style=emotion, |
|
emotion_weight=emotion_weight, |
|
) |
|
|
|
end_time = datetime.datetime.now() |
|
duration = (end_time - start_time).total_seconds() |
|
logger.info(f"Successful inference, took {duration}s | {speaker} | {sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale} | {text}") |
|
return f"Success, time: {duration} seconds.", (sr, audio) |
|
|
|
|
|
initial_text = "Hi there! How are you doing?" |
|
|
|
initial_md = """ |
|
# LemonfootSBV2 😊🍋 |
|
### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot) / [Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA) |
|
### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02) |
|
This Hugging Face Space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.
|
|
|
Do no evil. |
|
|
|
""" |
|
|
|
style_md = """ |
|
- You can control things like voice tone, emotion, and reading style through presets or through voice files. |
|
- Neutral acts as an average across all speakers. Other styling presets act as overrides relative to Neutral.
|
- Setting the intensity too high will likely break the output. |
|
- The required intensity will vary depending on the speaker and the desired style.
|
- If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker. |
|
""" |
|
|
|
|
|
def make_interactive(): |
|
return gr.update(interactive=True, value="Synthesize") |
|
|
|
|
|
def make_non_interactive(): |
|
return gr.update(interactive=False, value="Synthesize (Please load a model!)") |
|
|
|
|
|
def gr_util(item): |
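    """Toggle visibility between the style preset dropdown and the reference-audio input."""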
|
if item == "Select from presets": |
|
return (gr.update(visible=True), gr.Audio(visible=False, value=None)) |
|
else: |
|
return (gr.update(visible=False), gr.update(visible=True)) |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU") |
|
parser.add_argument( |
|
"--dir", "-d", type=str, help="Model directory", default=config.out_dir |
|
) |
|
args = parser.parse_args() |
|
model_dir = args.dir |
|
|
|
if args.cpu: |
|
device = "cpu" |
|
else: |
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
|
model_holder = ModelHolder(model_dir, device) |
|
|
|
languages = ["EN", "JP", "ZH"] |
|
|
|
model_names = model_holder.model_names |
|
if len(model_names) == 0: |
|
        logger.error(f"No models found. Please place a model in {model_dir}.")
|
sys.exit(1) |
|
initial_id = 0 |
|
initial_pth_files = model_holder.model_files_dict[model_names[initial_id]] |
|
|
|
    with gr.Blocks(
        theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"),
        title="LemonfootSBV2",
    ) as app:
|
gr.Markdown(initial_md) |
|
with gr.Row(): |
|
with gr.Column(): |
|
with gr.Row(): |
|
with gr.Column(scale=3): |
|
model_name = gr.Dropdown( |
|
label="Available Models", |
|
choices=model_names, |
|
value=model_names[initial_id], |
|
) |
|
model_path = gr.Dropdown( |
|
label="Model File", |
|
choices=initial_pth_files, |
|
value=initial_pth_files[0], |
|
) |
|
refresh_button = gr.Button("Refresh", scale=1, visible=not is_hf_spaces) |
|
load_button = gr.Button("Load", scale=1, variant="primary") |
|
text_input = gr.TextArea(label="Text", value=initial_text) |
|
|
|
                line_split = gr.Checkbox(label="Divide text separately by line breaks", value=True)
|
split_interval = gr.Slider( |
|
minimum=0.0, |
|
maximum=2, |
|
value=0.5, |
|
step=0.1, |
|
label="Length of division seperation time (in seconds)", |
|
) |
|
language = gr.Dropdown(choices=languages, value="EN", label="Language") |
|
speaker = gr.Dropdown(label="Speaker") |
|
with gr.Accordion(label="Advanced Settings", open=False): |
|
sdp_ratio = gr.Slider( |
|
minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio" |
|
) |
|
noise_scale = gr.Slider( |
|
minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise" |
|
) |
|
noise_scale_w = gr.Slider( |
|
minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W" |
|
) |
|
length_scale = gr.Slider( |
|
minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length" |
|
) |
|
use_style_text = gr.Checkbox(label="Use stylization text", value=False) |
|
style_text = gr.Textbox( |
|
label="Style text", |
|
placeholder="Why are you ignoring me? You're unforgivable and disgusting! I hope you die.", |
|
info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.", |
|
visible=False, |
|
) |
|
style_text_weight = gr.Slider( |
|
minimum=0, |
|
maximum=1, |
|
value=0.7, |
|
step=0.1, |
|
label="Text stylization strength", |
|
visible=False, |
|
) |
|
use_style_text.change( |
|
lambda x: (gr.Textbox(visible=x), gr.Slider(visible=x)), |
|
inputs=[use_style_text], |
|
outputs=[style_text, style_text_weight], |
|
) |
|
with gr.Column(): |
|
with gr.Accordion("Styling Guide", open=False): |
|
gr.Markdown(style_md) |
|
style_mode = gr.Radio( |
|
["Select from presets", "Use an audio file"], |
|
label="Style Specification", |
|
value="Select from presets", |
|
) |
|
style = gr.Dropdown( |
|
label="Current style (Neutral is an average style)", |
|
choices=["Please load a model first!"], |
|
value="Please load a model first!", |
|
) |
|
style_weight = gr.Slider( |
|
minimum=0, |
|
maximum=50, |
|
value=5, |
|
step=0.1, |
|
label="Style strength", |
|
) |
|
ref_audio_path = gr.Audio(label="Reference Audio", type="filepath", visible=False) |
|
tts_button = gr.Button( |
|
"Synthesize (Please load a model!)", variant="primary", interactive=False |
|
) |
|
text_output = gr.Textbox(label="Info") |
|
audio_output = gr.Audio(label="Result") |
|
|
|
tts_button.click( |
|
tts_fn, |
|
inputs=[ |
|
model_name, |
|
model_path, |
|
text_input, |
|
language, |
|
ref_audio_path, |
|
sdp_ratio, |
|
noise_scale, |
|
noise_scale_w, |
|
length_scale, |
|
line_split, |
|
split_interval, |
|
style_text, |
|
style_text_weight, |
|
use_style_text, |
|
style, |
|
style_weight, |
|
speaker, |
|
], |
|
outputs=[text_output, audio_output], |
|
) |
|
|
|
model_name.change( |
|
model_holder.update_model_files_dropdown, |
|
inputs=[model_name], |
|
outputs=[model_path], |
|
) |
|
|
|
model_path.change(make_non_interactive, outputs=[tts_button]) |
|
|
|
refresh_button.click( |
|
model_holder.update_model_names_dropdown, |
|
outputs=[model_name, model_path, tts_button], |
|
) |
|
|
|
load_button.click( |
|
model_holder.load_model, |
|
inputs=[model_name, model_path], |
|
outputs=[style, tts_button, speaker], |
|
) |
|
|
|
style_mode.change( |
|
gr_util, |
|
inputs=[style_mode], |
|
outputs=[style, ref_audio_path], |
|
) |
|
|
|
app.launch(inbrowser=True) |
|
|