Hololive-Style-Bert-VITS2

Running

File size: 17,526 Bytes

import argparse
import datetime
import os
import sys
import warnings
import json

import gradio as gr
import numpy as np
import torch
from gradio.processing_utils import convert_to_16_bit_wav

import utils
from config import config
from infer import get_net_g, infer
from tools.log import logger

# True when the SYSTEM environment variable equals "spaces"
# (presumably set when the app is hosted on Hugging Face Spaces).
is_hf_spaces = os.environ.get("SYSTEM") == "spaces"
# Character limit that tts_fn enforces on the input text when is_hf_spaces is true.
limit = 150


class Model:
    """One voice model: hyper-parameters, style vectors and (lazily) the network.

    Hyper-parameters and style vectors are read when the object is created;
    the generator network is only loaded on the first call to ``infer``.
    """

    def __init__(self, model_path, config_path, style_vec_path, device):
        """Store the file locations and load the hyper-parameters/style vectors.

        Args:
            model_path: Checkpoint path later handed to ``get_net_g``.
            config_path: Hyper-parameter file read by ``utils.get_hparams_from_file``.
            style_vec_path: File read with ``np.load`` to obtain the style vectors.
            device: Device identifier handed to ``get_net_g`` and ``infer``.
        """
        self.model_path = model_path
        self.config_path = config_path
        self.device = device
        self.style_vec_path = style_vec_path
        self.load()

    def load(self):
        """Read hyper-parameters and style vectors; leave the network unloaded."""
        self.hps = utils.get_hparams_from_file(self.config_path)
        self.spk2id = self.hps.data.spk2id
        self.num_styles = self.hps.data.num_styles
        if hasattr(self.hps.data, "style2id"):
            self.style2id = self.hps.data.style2id
        else:
            # No named styles in the config: name each style by its index.
            self.style2id = {str(i): i for i in range(self.num_styles)}

        self.style_vectors = np.load(self.style_vec_path)
        # Loaded on demand by load_net_g() the first time infer() runs.
        self.net_g = None

    def load_net_g(self):
        """Load the generator network for this model onto ``self.device``."""
        self.net_g = get_net_g(
            model_path=self.model_path,
            version=self.hps.version,
            device=self.device,
            hps=self.hps,
        )

    def get_style_vector(self, style_id, weight=1.0):
        """Return style vector ``style_id`` scaled by ``weight`` around vector 0.

        Vector 0 is used as the mean; ``weight`` 0 yields the mean itself and
        ``weight`` 1 yields the stored style vector unchanged.
        """
        mean = self.style_vectors[0]
        style_vec = self.style_vectors[style_id]
        style_vec = mean + (style_vec - mean) * weight
        return style_vec

    def get_style_vector_from_audio(self, audio_path, weight=1.0):
        """Extract a style vector from an audio file and scale it around vector 0."""
        # Imported lazily so the dependency is only needed when a reference
        # audio file is actually used.
        from style_gen import extract_style_vector

        xvec = extract_style_vector(audio_path)
        mean = self.style_vectors[0]
        xvec = mean + (xvec - mean) * weight
        return xvec

    def infer(
        self,
        text,
        language="JP",
        sid=0,
        reference_audio_path=None,
        sdp_ratio=0.2,
        noise=0.6,
        noisew=0.8,
        length=1.0,
        line_split=True,
        split_interval=0.2,
        style_text="",
        style_weight=0.7,
        use_style_text=False,
        style="0",
        emotion_weight=1.0,
    ):
        """Synthesize ``text`` and return ``(sampling_rate, audio)``.

        Args:
            text: Text to synthesize.
            language: Language code forwarded to the module-level ``infer``.
            sid: Speaker id forwarded to ``infer``.
            reference_audio_path: Optional audio file to derive the style
                vector from; ``None`` or ``""`` selects the preset ``style``.
            sdp_ratio: Forwarded to ``infer`` as ``sdp_ratio``.
            noise: Forwarded to ``infer`` as ``noise_scale``.
            noisew: Forwarded to ``infer`` as ``noise_scale_w``.
            length: Forwarded to ``infer`` as ``length_scale``.
            line_split: When true, every non-empty line is synthesized on its
                own and the pieces are joined with silence in between.
            split_interval: Length of that silence in seconds.
            style_text: Optional style text; ignored unless ``use_style_text``.
            style_weight: Forwarded to ``infer`` as ``style_weight``.
            use_style_text: Whether ``style_text`` is passed on at all.
            style: Key into ``self.style2id`` naming the preset style.
            emotion_weight: Weight applied to the style vector.

        Returns:
            Tuple of the model's sampling rate and the audio after
            ``convert_to_16_bit_wav``.

        Raises:
            ValueError: If ``line_split`` is true and ``text`` has no
                non-empty line.
        """
        # Work out the pieces first so that blank input fails fast, before the
        # network is loaded, and with a clear message instead of the
        # "need at least one array" error from np.concatenate.
        if line_split:
            segments = [t for t in text.split("\n") if t != ""]
            if not segments:
                raise ValueError("text contains no non-empty lines to synthesize")
        else:
            segments = [text]

        if reference_audio_path == "":
            reference_audio_path = None
        if style_text == "" or not use_style_text:
            style_text = None

        if self.net_g is None:
            self.load_net_g()
        if reference_audio_path is None:
            style_id = self.style2id[style]
            style_vector = self.get_style_vector(style_id, emotion_weight)
        else:
            style_vector = self.get_style_vector_from_audio(
                reference_audio_path, emotion_weight
            )

        sampling_rate = self.hps.data.sampling_rate
        # Arguments shared by every call to the module-level infer().
        shared_kwargs = dict(
            sdp_ratio=sdp_ratio,
            noise_scale=noise,
            noise_scale_w=noisew,
            length_scale=length,
            sid=sid,
            language=language,
            hps=self.hps,
            net_g=self.net_g,
            device=self.device,
            style_text=style_text,
            style_weight=style_weight,
            style_vec=style_vector,
        )

        pieces = []
        with torch.no_grad():
            for index, segment in enumerate(segments):
                if index > 0:
                    # Silence between lines, sized with the model's own
                    # sampling rate rather than a fixed 44100.
                    pieces.append(np.zeros(int(sampling_rate * split_interval)))
                pieces.append(infer(text=segment, **shared_kwargs))
        audio = pieces[0] if len(pieces) == 1 else np.concatenate(pieces)

        # Convert on every path so both modes return the same sample format.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            audio = convert_to_16_bit_wav(audio)
        return (sampling_rate, audio)


class ModelHolder:
    """Index of the model directories under ``root_dir`` and the active model."""

    # File extensions that identify a model checkpoint.
    _MODEL_SUFFIXES = (".pth", ".pt", ".safetensors")

    def __init__(self, root_dir, device):
        """Scan ``root_dir`` for models.

        Args:
            root_dir: Directory whose sub-directories each hold one model.
            device: Device identifier passed to every ``Model`` created here.
        """
        self.root_dir = root_dir
        self.device = device
        self.model_files_dict = {}
        self.current_model = None
        self.model_names = []
        self.models = []
        self.refresh()

    def refresh(self):
        """Rescan ``root_dir`` and forget the currently loaded model.

        Only sub-directories that contain at least one checkpoint file are
        registered in ``model_names`` / ``model_files_dict``.
        """
        self.model_files_dict = {}
        self.model_names = []
        self.current_model = None
        model_dirs = [
            d
            for d in os.listdir(self.root_dir)
            if os.path.isdir(os.path.join(self.root_dir, d))
        ]
        for model_name in model_dirs:
            model_dir = os.path.join(self.root_dir, model_name)
            model_files = [
                os.path.join(model_dir, f)
                for f in os.listdir(model_dir)
                if f.endswith(self._MODEL_SUFFIXES)
            ]
            if not model_files:
                logger.info(
                    f"No model files found in {self.root_dir}/{model_name}, so skip it"
                )
                # Actually skip it: registering an empty file list would make
                # the dropdown helpers fail on model_files[0].
                continue
            self.model_files_dict[model_name] = model_files
            self.model_names.append(model_name)

    def load_model(self, model_name, model_path):
        """Make ``model_path`` of ``model_name`` the current model.

        Returns:
            Gradio updates for the style dropdown, the synthesize button and
            the speaker dropdown, in that order.

        Raises:
            Exception: If the model name or the checkpoint path is not one of
                those found by ``refresh``.
        """
        if model_name not in self.model_files_dict:
            raise Exception(f"モデル名{model_name}は存在しません")
        if model_path not in self.model_files_dict[model_name]:
            raise Exception(f"pthファイル{model_path}は存在しません")
        self.current_model = Model(
            model_path=model_path,
            config_path=os.path.join(self.root_dir, model_name, "config.json"),
            style_vec_path=os.path.join(self.root_dir, model_name, "style_vectors.npy"),
            device=self.device,
        )
        styles = list(self.current_model.style2id.keys())
        speakers = list(self.current_model.spk2id.keys())
        return (
            gr.Dropdown(choices=styles, value=styles[0]),
            gr.update(interactive=True, value="Synthesize"),
            gr.Dropdown(choices=speakers, value=speakers[0]),
        )

    def update_model_files_dropdown(self, model_name):
        """Return a dropdown listing the checkpoint files of ``model_name``."""
        model_files = self.model_files_dict[model_name]
        return gr.Dropdown(choices=model_files, value=model_files[0])

    def update_model_names_dropdown(self):
        """Rescan the models and return refreshed dropdowns plus a button update."""
        self.refresh()
        initial_model_name = self.model_names[0]
        initial_model_files = self.model_files_dict[initial_model_name]
        return (
            gr.Dropdown(choices=self.model_names, value=initial_model_name),
            gr.Dropdown(choices=initial_model_files, value=initial_model_files[0]),
            gr.update(interactive=False),  # For tts_button
        )


def tts_fn(
    model_name,
    model_path,
    text,
    language,
    reference_audio_path,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    line_split,
    split_interval,
    style_text,
    style_weight,
    use_style_text,
    emotion,
    emotion_weight,
    speaker,
):
    """Synthesize ``text`` with the requested model and report the outcome.

    The model is (re)loaded through the global ``model_holder`` when none is
    loaded yet or when the loaded checkpoint differs from ``model_path``.

    Returns:
        A ``(message, (sampling_rate, audio))`` pair. For empty text, or text
        over ``limit`` characters while ``is_hf_spaces`` is true, the message
        explains the problem and the audio part is ``(44100, None)``.
    """
    no_audio = (44100, None)

    if not text:
        return "Please enter some text.", no_audio

    if is_hf_spaces and len(text) > limit:
        return f"Too long! There is a character limit of {limit} characters.", no_audio

    # Load on first use, or switch when a different checkpoint is requested.
    loaded = model_holder.current_model
    if not loaded or loaded.model_path != model_path:
        model_holder.load_model(model_name, model_path)

    active_model = model_holder.current_model
    speaker_id = active_model.spk2id[speaker]

    started_at = datetime.datetime.now()
    sr, audio = active_model.infer(
        text=text,
        language=language,
        sid=speaker_id,
        reference_audio_path=reference_audio_path,
        sdp_ratio=sdp_ratio,
        noise=noise_scale,
        noisew=noise_scale_w,
        length=length_scale,
        line_split=line_split,
        split_interval=split_interval,
        style_text=style_text,
        style_weight=style_weight,
        use_style_text=use_style_text,
        style=emotion,
        emotion_weight=emotion_weight,
    )
    finished_at = datetime.datetime.now()

    duration = (finished_at - started_at).total_seconds()
    logger.info(f"Successful inference, took {duration}s | {speaker} | {sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{emotion}/{emotion_weight} | {text}")
    return f"Success, time: {duration} seconds.", (sr, audio)

def load_voicedata(voicelist_path="voicelist.json", model_root="model_assets"):
    """Read the voice list and collect the style names of every model it uses.

    Args:
        voicelist_path: JSON file mapping a voice name to a dict with the keys
            ``enable``, ``model_path``, ``title``, ``speakerid`` and ``cover``.
        model_root: Directory that contains ``<model_path>/config.json`` for
            each model referenced by the voice list.

    Returns:
        A pair ``(voices, styledict)``: ``voices`` is a list of
        ``(name, model_path, title, speakerid, cover)`` tuples for the enabled
        entries, and ``styledict`` maps each model path to a list of its
        style names.
    """
    logger.info("Loading voice data...")
    voices = []
    styledict = {}
    with open(voicelist_path, "r", encoding="utf-8") as f:
        voc_info = json.load(f)
    for name, info in voc_info.items():
        if not info['enable']:
            continue
        model_path = info['model_path']
        voice_name = info['title']
        speakerid = info['speakerid']
        image = info['cover']
        # Each model's config is read only once, however many voices share it.
        if model_path not in styledict:
            conf = f"{model_root}/{model_path}/config.json"
            hps = utils.get_hparams_from_file(conf)
            if hasattr(hps.data, "style2id"):
                style_names = list(hps.data.style2id.keys())
            else:
                # Same fallback as Model.load: name styles by their index.
                style_names = [str(i) for i in range(hps.data.num_styles)]
            styledict[model_path] = style_names
        voices.append((name, model_path, voice_name, speakerid, image))
    return voices, styledict
        

# Default content of the "Text" input box.
initial_text = "Hello there! This is test audio of Lemonfoot S B V 2."

# Markdown rendered at the top of the page.
initial_md = """
# LemonfootSBV2 😊🍋
### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
This HuggingFace space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.

Do no evil.

"""

# Markdown shown inside the "Styling Guide" accordion.
style_md = """
- You can control things like voice tone, emotion, and reading style through presets or through voice files.
- Neutral acts as an average across all speakers. Styling options act as an override to Neutral.
- Setting the intensity too high will likely break the output.
- The required intensity will depend based on the speaker and the desired style.
- If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
"""


def make_interactive():
    """Return a Gradio update that enables a component and labels it "Synthesize"."""
    enabled_update = gr.update(interactive=True, value="Synthesize")
    return enabled_update


def make_non_interactive():
    """Return a Gradio update that disables a component and asks for a model load."""
    disabled_update = gr.update(
        interactive=False,
        value="Synthesize (Please load a model!)",
    )
    return disabled_update


def gr_util(item):
    """Return visibility updates for two components based on ``item``.

    For ``"Select from presets"`` the first component is shown and the second
    is replaced by a hidden, cleared ``gr.Audio``; for any other value the
    first is hidden and the second is shown.
    """
    if item != "Select from presets":
        return (gr.update(visible=False), gr.update(visible=True))
    return (gr.update(visible=True), gr.Audio(visible=False, value=None))


if __name__ == "__main__":
    # Command-line options: force CPU and/or choose the model directory.
    parser = argparse.ArgumentParser()
    parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
    parser.add_argument(
        "--dir", "-d", type=str, help="Model directory", default=config.out_dir
    )
    args = parser.parse_args()
    model_dir = args.dir

    if args.cpu:
        device = "cpu"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Module-level global: tts_fn reads it to find and load models.
    model_holder = ModelHolder(model_dir, device)

    languages = ["EN", "JP", "ZH"]

    model_names = model_holder.model_names
    if len(model_names) == 0:
        logger.error(f"No models found. Please place the model in {model_dir}.")
        sys.exit(1)
    initial_id = 0
    initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
    print(initial_pth_files)

    voicedata, styledict = load_voicedata()

    # Gradio preload: these shared controls are created here, referenced by
    # every voice tab's click handler, and rendered once below the tabs.
    text_input = gr.TextArea(label="Text", value=initial_text)
    line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True)
    split_interval = gr.Slider(
        minimum=0.0,
        maximum=2,
        value=0.5,
        step=0.1,
        label="Length of division seperation time (in seconds)",
    )
    language = gr.Dropdown(choices=languages, value="EN", label="Language")
    sdp_ratio = gr.Slider(
        minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
    )
    noise_scale = gr.Slider(
        minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
    )
    noise_scale_w = gr.Slider(
        minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
    )
    length_scale = gr.Slider(
        minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
    )
    use_style_text = gr.Checkbox(label="Use stylization text", value=False)
    style_text = gr.Textbox(
        label="Style text",
        placeholder="Check the \"Use styleization text\" box to use this option!",
        info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.",
        visible=True,
    )
    style_text_weight = gr.Slider(
        minimum=0,
        maximum=1,
        value=0.7,
        step=0.1,
        label="Text stylization strength",
        visible=True,
    )

    with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="LemonfootSBV2") as app:
        gr.Markdown(initial_md)

        # One tab per enabled voice from the voice list.
        for (name, model_path, voice_name, speakerid, image) in voicedata:
            with gr.TabItem(name):
                # Hidden textboxes carry this tab's model/speaker into tts_fn.
                mn = gr.Textbox(value=model_path, visible=False, interactive=False)
                # Built with os.path.join from model_dir so the value matches
                # the paths ModelHolder.refresh() records on every platform;
                # hard-coded backslashes only match on Windows.
                mp = gr.Textbox(
                    value=os.path.join(model_dir, model_path, f"{model_path}.safetensors"),
                    visible=False,
                    interactive=False,
                )
                spk = gr.Textbox(value=speakerid, visible=False, interactive=False)
                with gr.Row():
                    with gr.Column():
                        gr.Markdown(f"**{voice_name}**\n\nModel name: {model_path}")
                        gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False)
                    with gr.Column():
                        with gr.TabItem("Preset Styles"):
                            style = gr.Dropdown(
                                label="Current style (Neutral is an average style)",
                                choices=styledict[model_path],
                                value="Neutral",
                            )
                        with gr.TabItem("Use an audio file"):
                            ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
                        style_weight = gr.Slider(
                            minimum=0,
                            maximum=50,
                            value=5,
                            step=0.1,
                            label="Style strength",
                        )
                    with gr.Column():
                        tts_button = gr.Button(
                            "Synthesize", variant="primary", interactive=True
                        )
                        text_output = gr.Textbox(label="Info")
                        audio_output = gr.Audio(label="Result")

                        # Input order must match the parameter order of tts_fn.
                        tts_button.click(
                            tts_fn,
                            inputs=[
                                mn,
                                mp,
                                text_input,
                                language,
                                ref_audio_path,
                                sdp_ratio,
                                noise_scale,
                                noise_scale_w,
                                length_scale,
                                line_split,
                                split_interval,
                                style_text,
                                style_text_weight,
                                use_style_text,
                                style,
                                style_weight,
                                spk,
                            ],
                            outputs=[text_output, audio_output],
                        )

        # Shared controls, rendered once below the voice tabs.
        with gr.Row():
            with gr.Column():
                text_input.render()
                line_split.render()
                split_interval.render()
                language.render()
            with gr.Column():
                sdp_ratio.render()
                noise_scale.render()
                noise_scale_w.render()
                length_scale.render()
                use_style_text.render()
                style_text.render()
                style_text_weight.render()

        with gr.Accordion("Styling Guide", open=False):
            gr.Markdown(style_md)

    app.launch(allowed_paths=['/file/images/'])