|
import argparse |
|
import datetime |
|
import os |
|
import sys |
|
import warnings |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
from gradio.processing_utils import convert_to_16_bit_wav |
|
|
|
import utils |
|
from config import config |
|
from infer import get_net_g, infer |
|
from tools.log import logger |
|
|
|
is_hf_spaces = os.getenv("SYSTEM") == "spaces" |
|
limit = 150 |
|
|
|
|
|
class Model: |
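    """Holds a single Style-Bert-VITS2 model along with its hyperparameters and style vectors."""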
|
def __init__(self, model_path, config_path, style_vec_path, device): |
|
self.model_path = model_path |
|
self.config_path = config_path |
|
self.device = device |
|
self.style_vec_path = style_vec_path |
|
self.load() |
|
|
|
def load(self): |
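        """Load hyperparameters, speaker/style mappings, and style vectors; defer loading the network itself."""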
|
self.hps = utils.get_hparams_from_file(self.config_path) |
|
self.spk2id = self.hps.data.spk2id |
|
self.num_styles = self.hps.data.num_styles |
|
if hasattr(self.hps.data, "style2id"): |
|
self.style2id = self.hps.data.style2id |
|
else: |
|
self.style2id = {str(i): i for i in range(self.num_styles)} |
|
|
|
self.style_vectors = np.load(self.style_vec_path) |
|
self.net_g = None |
|
|
|
def load_net_g(self): |
|
self.net_g = get_net_g( |
|
model_path=self.model_path, |
|
version=self.hps.version, |
|
device=self.device, |
|
hps=self.hps, |
|
) |
|
|
|
def get_style_vector(self, style_id, weight=1.0): |
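        """Blend the mean style vector (index 0) toward the chosen style by the given weight."""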
|
mean = self.style_vectors[0] |
|
style_vec = self.style_vectors[style_id] |
|
style_vec = mean + (style_vec - mean) * weight |
|
return style_vec |
|
|
|
def get_style_vector_from_audio(self, audio_path, weight=1.0): |
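        """Extract a style vector from a reference audio file and blend it with the mean style."""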
|
from style_gen import extract_style_vector |
|
|
|
xvec = extract_style_vector(audio_path) |
|
mean = self.style_vectors[0] |
|
xvec = mean + (xvec - mean) * weight |
|
return xvec |
|
|
|
def infer( |
|
self, |
|
text, |
|
language="JP", |
|
sid=0, |
|
reference_audio_path=None, |
|
sdp_ratio=0.2, |
|
noise=0.6, |
|
noisew=0.8, |
|
length=1.0, |
|
line_split=True, |
|
split_interval=0.2, |
|
style_text="", |
|
style_weight=0.7, |
|
use_style_text=False, |
|
style="0", |
|
emotion_weight=1.0, |
|
): |
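        """Synthesize speech for `text`, optionally splitting on line breaks and inserting silence between lines."""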
|
if reference_audio_path == "": |
|
reference_audio_path = None |
|
if style_text == "" or not use_style_text: |
|
style_text = None |
|
|
|
if self.net_g is None: |
|
self.load_net_g() |
|
if reference_audio_path is None: |
|
style_id = self.style2id[style] |
|
style_vector = self.get_style_vector(style_id, emotion_weight) |
|
else: |
|
style_vector = self.get_style_vector_from_audio( |
|
reference_audio_path, emotion_weight |
|
) |
|
if not line_split: |
|
with torch.no_grad(): |
|
audio = infer( |
|
text=text, |
|
sdp_ratio=sdp_ratio, |
|
noise_scale=noise, |
|
noise_scale_w=noisew, |
|
length_scale=length, |
|
sid=sid, |
|
language=language, |
|
hps=self.hps, |
|
net_g=self.net_g, |
|
device=self.device, |
|
style_text=style_text, |
|
style_weight=style_weight, |
|
style_vec=style_vector, |
|
) |
|
else: |
|
texts = text.split("\n") |
|
texts = [t for t in texts if t != ""] |
|
audios = [] |
|
with torch.no_grad(): |
|
for i, t in enumerate(texts): |
|
audios.append( |
|
infer( |
|
text=t, |
|
sdp_ratio=sdp_ratio, |
|
noise_scale=noise, |
|
noise_scale_w=noisew, |
|
length_scale=length, |
|
sid=sid, |
|
language=language, |
|
hps=self.hps, |
|
net_g=self.net_g, |
|
device=self.device, |
|
style_text=style_text, |
|
style_weight=style_weight, |
|
style_vec=style_vector, |
|
) |
|
) |
|
if i != len(texts) - 1: |
|
                        audios.append(np.zeros(int(self.hps.data.sampling_rate * split_interval)))
|
audio = np.concatenate(audios) |
|
with warnings.catch_warnings(): |
|
warnings.simplefilter("ignore") |
|
audio = convert_to_16_bit_wav(audio) |
|
return (self.hps.data.sampling_rate, audio) |
|
|
|
|
|
class ModelHolder: |
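    """Scans a root directory for model folders and manages loading the currently selected model."""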
|
def __init__(self, root_dir, device): |
|
self.root_dir = root_dir |
|
self.device = device |
|
self.model_files_dict = {} |
|
self.current_model = None |
|
self.model_names = [] |
|
self.models = [] |
|
self.refresh() |
|
|
|
def refresh(self): |
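        """Rescan the root directory for subdirectories containing .pth/.pt/.safetensors model files."""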
|
self.model_files_dict = {} |
|
self.model_names = [] |
|
self.current_model = None |
|
model_dirs = [ |
|
d |
|
for d in os.listdir(self.root_dir) |
|
if os.path.isdir(os.path.join(self.root_dir, d)) |
|
] |
|
for model_name in model_dirs: |
|
model_dir = os.path.join(self.root_dir, model_name) |
|
model_files = [ |
|
os.path.join(model_dir, f) |
|
for f in os.listdir(model_dir) |
|
if f.endswith(".pth") or f.endswith(".pt") or f.endswith(".safetensors") |
|
] |
|
            if len(model_files) == 0:

                logger.info(

                    f"No model files found in {self.root_dir}/{model_name}, skipping it"

                )

                continue
|
self.model_files_dict[model_name] = model_files |
|
self.model_names.append(model_name) |
|
|
|
def load_model(self, model_name, model_path): |
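        """Load the selected model file and return updated style, synthesize-button, and speaker components."""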
|
if model_name not in self.model_files_dict: |
|
            raise Exception(f"Model name {model_name} does not exist")
|
if model_path not in self.model_files_dict[model_name]: |
|
            raise Exception(f"Model file {model_path} does not exist")
|
self.current_model = Model( |
|
model_path=model_path, |
|
config_path=os.path.join(self.root_dir, model_name, "config.json"), |
|
style_vec_path=os.path.join(self.root_dir, model_name, "style_vectors.npy"), |
|
device=self.device, |
|
) |
|
styles = list(self.current_model.style2id.keys()) |
|
speakers = list(self.current_model.spk2id.keys()) |
|
return ( |
|
gr.Dropdown(choices=styles, value=styles[0]), |
|
gr.update(interactive=True, value="Synthesize"), |
|
gr.Dropdown(choices=speakers, value=speakers[0]), |
|
) |
|
|
|
def update_model_files_dropdown(self, model_name): |
|
model_files = self.model_files_dict[model_name] |
|
return gr.Dropdown(choices=model_files, value=model_files[0]) |
|
|
|
def update_model_names_dropdown(self): |
|
self.refresh() |
|
initial_model_name = self.model_names[0] |
|
initial_model_files = self.model_files_dict[initial_model_name] |
|
return ( |
|
gr.Dropdown(choices=self.model_names, value=initial_model_name), |
|
gr.Dropdown(choices=initial_model_files, value=initial_model_files[0]), |
|
gr.update(interactive=False), |
|
) |
|
|
|
|
|
def tts_fn( |
|
model_name, |
|
model_path, |
|
text, |
|
language, |
|
reference_audio_path, |
|
sdp_ratio, |
|
noise_scale, |
|
noise_scale_w, |
|
length_scale, |
|
line_split, |
|
split_interval, |
|
style_text, |
|
style_weight, |
|
use_style_text, |
|
emotion, |
|
emotion_weight, |
|
speaker, |
|
): |
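    """Gradio callback: validate the input text, (re)load the model if needed, and run inference."""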
|
if not text: |
|
return "Please enter some text.", (44100, None) |
|
|
|
|
|
|
|
|
|
|
|
|
|
if is_hf_spaces and len(text) > limit: |
|
return f"Too long! There is a character limit of {limit} characters.", (44100, None) |
|
|
|
assert model_holder.current_model is not None |
|
|
|
    if model_holder.current_model.model_path != model_path:
|
model_holder.load_model(model_name, model_path) |
|
|
|
speaker_id = model_holder.current_model.spk2id[speaker] |
|
|
|
start_time = datetime.datetime.now() |
|
|
|
sr, audio = model_holder.current_model.infer( |
|
text=text, |
|
language=language, |
|
sid=speaker_id, |
|
reference_audio_path=reference_audio_path, |
|
sdp_ratio=sdp_ratio, |
|
noise=noise_scale, |
|
noisew=noise_scale_w, |
|
length=length_scale, |
|
line_split=line_split, |
|
split_interval=split_interval, |
|
style_text=style_text, |
|
style_weight=style_weight, |
|
use_style_text=use_style_text, |
|
style=emotion, |
|
emotion_weight=emotion_weight, |
|
) |
|
|
|
end_time = datetime.datetime.now() |
|
duration = (end_time - start_time).total_seconds() |
|
logger.info(f"Successful inference, took {duration}s | {speaker} | {sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale} | {text}") |
|
return f"Success, time: {duration} seconds.", (sr, audio) |
|
|
|
|
|
initial_text = "Hi there! How are you doing?" |
|
|
|
initial_md = """ |
|
# LemonfootSBV2 😊🍋 |
|
### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot) / [Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA) |
|
### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02) |
|
This Hugging Face Space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.
|
|
|
Do no evil. |
|
|
|
""" |
|
|
|
style_md = """ |
|
- You can control things like voice tone, emotion, and reading style through presets or through voice files. |
|
- Neutral acts as an average across all speakers. Other styling presets act as overrides relative to Neutral.
|
- Setting the intensity too high will likely break the output. |
|
- The required intensity will vary depending on the speaker and the desired style.
|
- If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker. |
|
""" |
|
|
|
|
|
def make_interactive(): |
|
return gr.update(interactive=True, value="Synthesize") |
|
|
|
|
|
def make_non_interactive(): |
|
return gr.update(interactive=False, value="Synthesize (Please load a model!)") |
|
|
|
|
|
def gr_util(item): |
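    """Toggle visibility between the style preset dropdown and the reference-audio input."""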
|
if item == "Select from presets": |
|
return (gr.update(visible=True), gr.Audio(visible=False, value=None)) |
|
else: |
|
return (gr.update(visible=False), gr.update(visible=True)) |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU") |
|
parser.add_argument( |
|
"--dir", "-d", type=str, help="Model directory", default=config.out_dir |
|
) |
|
args = parser.parse_args() |
|
model_dir = args.dir |
|
|
|
if args.cpu: |
|
device = "cpu" |
|
else: |
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
|
model_holder = ModelHolder(model_dir, device) |
|
|
|
languages = ["EN", "JP", "ZH"] |
|
|
|
model_names = model_holder.model_names |
|
if len(model_names) == 0: |
|
        logger.error(f"No models found. Please place a model in {model_dir}.")
|
sys.exit(1) |
|
initial_id = 0 |
|
initial_pth_files = model_holder.model_files_dict[model_names[initial_id]] |
|
|
|
    with gr.Blocks(
        theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"),
        title="LemonfootSBV2",
    ) as app:
|
gr.Markdown(initial_md) |
|
with gr.Row(): |
|
with gr.Column(): |
|
with gr.Row(): |
|
with gr.Column(scale=3): |
|
model_name = gr.Dropdown( |
|
label="Available Models", |
|
choices=model_names, |
|
value=model_names[initial_id], |
|
) |
|
model_path = gr.Dropdown( |
|
label="Model File", |
|
choices=initial_pth_files, |
|
value=initial_pth_files[0], |
|
) |
|
refresh_button = gr.Button("Refresh", scale=1, visible=not is_hf_spaces) |
|
load_button = gr.Button("Load", scale=1, variant="primary") |
|
text_input = gr.TextArea(label="Text", value=initial_text) |
|
|
|
                line_split = gr.Checkbox(label="Divide text separately by line breaks", value=True)
|
split_interval = gr.Slider( |
|
minimum=0.0, |
|
maximum=2, |
|
value=0.5, |
|
step=0.1, |
|
label="Length of division seperation time (in seconds)", |
|
) |
|
language = gr.Dropdown(choices=languages, value="EN", label="Language") |
|
speaker = gr.Dropdown(label="Speaker") |
|
with gr.Accordion(label="Advanced Settings", open=False): |
|
sdp_ratio = gr.Slider( |
|
minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio" |
|
) |
|
noise_scale = gr.Slider( |
|
minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise" |
|
) |
|
noise_scale_w = gr.Slider( |
|
minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W" |
|
) |
|
length_scale = gr.Slider( |
|
minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length" |
|
) |
|
use_style_text = gr.Checkbox(label="Use stylization text", value=False) |
|
style_text = gr.Textbox( |
|
label="Style text", |
|
placeholder="Why are you ignoring me? You're unforgivable and disgusting! I hope you die.", |
|
info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.", |
|
visible=False, |
|
) |
|
style_text_weight = gr.Slider( |
|
minimum=0, |
|
maximum=1, |
|
value=0.7, |
|
step=0.1, |
|
label="Text stylization strength", |
|
visible=False, |
|
) |
|
use_style_text.change( |
|
lambda x: (gr.Textbox(visible=x), gr.Slider(visible=x)), |
|
inputs=[use_style_text], |
|
outputs=[style_text, style_text_weight], |
|
) |
|
with gr.Column(): |
|
with gr.Accordion("Styling Guide", open=False): |
|
gr.Markdown(style_md) |
|
style_mode = gr.Radio( |
|
["Select from presets", "Use an audio file"], |
|
label="Style Specification", |
|
value="Select from presets", |
|
) |
|
style = gr.Dropdown( |
|
label="Current style (Neutral is an average style)", |
|
choices=["Please load a model first!"], |
|
value="Please load a model first!", |
|
) |
|
style_weight = gr.Slider( |
|
minimum=0, |
|
maximum=50, |
|
value=5, |
|
step=0.1, |
|
label="Style strength", |
|
) |
|
ref_audio_path = gr.Audio(label="Reference Audio", type="filepath", visible=False) |
|
tts_button = gr.Button( |
|
"Synthesize (Please load a model!)", variant="primary", interactive=False |
|
) |
|
text_output = gr.Textbox(label="Info") |
|
audio_output = gr.Audio(label="Result") |
|
|
|
tts_button.click( |
|
tts_fn, |
|
inputs=[ |
|
model_name, |
|
model_path, |
|
text_input, |
|
language, |
|
ref_audio_path, |
|
sdp_ratio, |
|
noise_scale, |
|
noise_scale_w, |
|
length_scale, |
|
line_split, |
|
split_interval, |
|
style_text, |
|
style_text_weight, |
|
use_style_text, |
|
style, |
|
style_weight, |
|
speaker, |
|
], |
|
outputs=[text_output, audio_output], |
|
) |
|
|
|
model_name.change( |
|
model_holder.update_model_files_dropdown, |
|
inputs=[model_name], |
|
outputs=[model_path], |
|
) |
|
|
|
model_path.change(make_non_interactive, outputs=[tts_button]) |
|
|
|
refresh_button.click( |
|
model_holder.update_model_names_dropdown, |
|
outputs=[model_name, model_path, tts_button], |
|
) |
|
|
|
load_button.click( |
|
model_holder.load_model, |
|
inputs=[model_name, model_path], |
|
outputs=[style, tts_button, speaker], |
|
) |
|
|
|
style_mode.change( |
|
gr_util, |
|
inputs=[style_mode], |
|
outputs=[style, ref_audio_path], |
|
) |
|
|
|
app.launch(inbrowser=True) |
|
|