Spaces:

ShoukanLabs
/

Vokan

Running on Zero

File size: 9,596 Bytes

import gradio as gr
import spaces
from styletts2 import tts
import re
import numpy as np
from scipy.io.wavfile import write
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

import torch

import phonemizer  # en-us

# HTML/CSS header shown at the top of the page (rendered through gr.HTML(INTRO)):
# a rounded logo container followed by a one-line tagline.
INTRO = """
<style>

  .TitleContainer {
    background-color: #ffff;
    margin-bottom: 0rem;
    margin-left: auto;
    margin-right: auto;
    width: 40%;
    height: 30%;
    border-radius: 10rem;
    border: 0.5vw solid #ff593e;
    text-align: center;
    display: flex;
    justify-content: center;
    transition: .6s;
  }

  .TitleContainer:hover {
    transform: scale(1.05);
  }

  .VokanLogo {
    margin: auto;
    display: block;
  }

</style>

<div class="TitleContainer">
      <img src="https://huggingface.co/spaces/ShoukanLabs/Vokan/resolve/main/Vokan.gif" class="VokanLogo">
</div>

<p align="center" style="font-size: 1vw; font-weight: bold; color: #ff593e;">A StyleTTS2 fine-tune, designed for expressiveness.</p>

<hr>
"""

# JavaScript snippet handed to gr.Blocks(js=...) further down. It sets the
# `__theme` query parameter to `light` and navigates to the updated URL when
# the parameter is not already `light`.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""

# Example rows for gr.Examples. Each row follows the order of the `inputs`
# list used there: [voice, inp, speed, alpha, beta, embscale, steps], i.e.
# reference audio path, text, speed, alpha, beta, embedding scale, diffusion steps.
examples = [
        ["./Examples/David Attenborough.wav",
        "An understanding of the natural world is a source of not only great curiosity, but great fulfilment.",
        1, 0.2, 0.5, 1, 200],
        ["./Examples/Linus Tech Tips.wav",
        "sometimes I get so in the zone while building a computer it's like an out of body experience.",
        1, 0.2, 0.8, 2, 200],
        ["./Examples/Melina.wav",
        "If you intend to claim the Frenzied Flame, I ask that you cease. It is not to be meddled with. It is chaos, "
        "devouring life and thought unending. However ruined this world has become, "
        "however mired in torment and despair, life endures.",
        0.95, 0.2, 0.5, 2, 200],
        ["./Examples/Patrick Bateman.wav",
        "My Pain Is Constant And Sharp, And I Do Not Wish For A Better World For Anyone.",
        1, 0.1, 0.3, 2, 200],
        ["./Examples/Furina.ogg",
        "That's more like it! As expected, my dazzling side comes through in any situation.",
        1, 0.2, 0.8, 2, 200]
]


# Gradio theme for the app: the Soft base theme with a custom primary palette
# built around #ff593e, an orange secondary hue, custom corner radii and the
# 'M PLUS Rounded 1c' Google font (with generic sans-serif fallbacks).
# Blocks use the `neutral_50` colour as their background fill.
theme = gr.themes.Soft(
    primary_hue=gr.themes.Color(c100="#ffd7d1", c200="#ff593e", c300="#ff593e", c400="#ff593e", c50="#fff0f0",
                                c500="#ff593e", c600="#ea580c", c700="#c2410c", c800="#9a3412", c900="#7c2d12",
                                c950="#6c2e12"),
    secondary_hue="orange",
    radius_size=gr.themes.Size(lg="20px", md="8px", sm="6px", xl="30px", xs="4px", xxl="40px", xxs="2px"),
    font=[gr.themes.GoogleFont('M PLUS Rounded 1c'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
).set(
    block_background_fill='*neutral_50'
)

# Module-wide espeak phonemizer backend for US English, created once and
# reused by text_to_phonemes(). It is configured to keep punctuation and
# stress marks, to strip language-switch flags, and not to tie phonemes.
global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us',
                                                     preserve_punctuation=True,
                                                     with_stress=True,
                                                     language_switch="remove-flags",
                                                     tie=False)


def split_and_recombine_text(text, desired_length=200, max_length=300):
    """Split text into chunks of a desired length, trying to keep sentences intact.

    The text is first normalized: every run of whitespace (newlines included)
    becomes a single space and curly double quotes become ASCII ``"``. It is
    then scanned character by character. Sentence ends (``!``, ``?``, or a
    ``.`` followed by whitespace) outside of double quotes, and the end of a
    quotation followed by whitespace, are remembered as candidate split points.

    Args:
        text: The text to split.
        desired_length: Once the current chunk holds at least this many
            characters, it is closed at the next sentence boundary.
        max_length: When the current chunk reaches this many characters it is
            split regardless: at the last remembered sentence boundary if
            there is one, otherwise at the nearest preceding punctuation or
            space (without shrinking below ``desired_length``).

    Returns:
        A list of stripped chunks. Chunks that are empty or contain only
        whitespace/punctuation are dropped, so empty input yields ``[]``.
    """
    # Normalize whitespace and quotes. Collapsing `\s+` also removes every
    # newline, so no separate blank-line handling is needed.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[“”]', '"', text)

    rv = []
    in_quote = False
    current = ""
    split_pos = []
    pos = -1
    end_pos = len(text) - 1

    def seek(delta):
        """Move the cursor by `delta` characters, keeping `current` and `in_quote` in sync."""
        nonlocal pos, in_quote, current
        is_neg = delta < 0
        for _ in range(abs(delta)):
            if is_neg:
                # Undo the toggle for the character being dropped. Checking the
                # character we land on instead would leave `in_quote` inverted
                # whenever the rewind starts on (or passes over) a quote mark.
                if text[pos] == '"':
                    in_quote = not in_quote
                pos -= 1
                current = current[:-1]
            else:
                pos += 1
                current += text[pos]
                if text[pos] == '"':
                    in_quote = not in_quote
        return text[pos]

    def peek(delta):
        """Return the character `delta` positions from the cursor, or "" if unavailable.

        NOTE: the bound is `p < end_pos`, so the final character of the text is
        reported as "" as well. Because "" is `in` every string, the callers'
        membership tests treat the end of the text like trailing whitespace.
        """
        p = pos + delta
        return text[p] if p < end_pos and p >= 0 else ""

    def commit():
        """Close the current chunk and forget its candidate split points."""
        nonlocal rv, current, split_pos
        rv.append(current)
        current = ""
        split_pos = []

    while pos < end_pos:
        c = seek(1)
        # do we need to force a split?
        if len(current) >= max_length:
            if len(split_pos) > 0 and len(current) > (desired_length / 2):
                # we have at least one sentence and we are over half the desired length, seek back to the last split
                d = pos - split_pos[-1]
                seek(-d)
            else:
                # no full sentences, seek back until we are not in the middle of a word and split there
                while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
                    c = seek(-1)
            commit()
        # check for sentence boundaries
        elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in '\n ')):
            # seek forward if we have consecutive boundary markers but still within the max length
            while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
                c = seek(1)
            split_pos.append(pos)
            if len(current) >= desired_length:
                commit()
        # treat end of quote as a boundary if its followed by a space or newline
        elif in_quote and peek(1) == '"' and peek(2) in '\n ':
            seek(2)
            split_pos.append(pos)
    rv.append(current)

    # clean up, remove lines with only whitespace or punctuation
    rv = [s.strip() for s in rv]
    rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]

    return rv


def text_to_phonemes(text):
    """Phonemize `text` and return the phonemes as one space-separated string.

    The stripped text is run through the shared `global_phonemizer`, the first
    (only) result is split with NLTK's `word_tokenize`, and the tokens are
    joined back with single spaces. Each stage is printed for debugging.
    """
    stripped = text.strip()
    print("Text before phonemization: ", stripped)

    phonemized = global_phonemizer.phonemize([stripped])
    print("Text after phonemization: ", phonemized)

    # A single string went in, so only the first entry of the result is used.
    tokens = word_tokenize(phonemized[0])
    joined = ' '.join(tokens)
    print("Final text after tokenization: ", joined)
    return joined


@spaces.GPU
def generate(audio_path, ins, speed, alpha, beta, embedding, steps=100):
    """Synthesize `ins` using the voice style of the recording at `audio_path`.

    The text is chunked with split_and_recombine_text(); every chunk is
    phonemized and passed to `other_tts.long_inference_segment`, with the
    style returned for one chunk fed into the next call as `prev_s`. The
    resulting segments are concatenated and peak-normalized to 16-bit PCM.

    Args:
        audio_path: File path of the reference recording (passed to
            `other_tts.compute_style`).
        ins: Text to synthesize.
        speed: Forwarded as `speed`.
        alpha: Forwarded as `alpha`.
        beta: Forwarded as `beta`.
        embedding: Forwarded as `embedding_scale`.
        steps: Forwarded as `diffusion_steps`.

    Returns:
        A `(24000, samples)` tuple where `samples` is an int16 NumPy array.

    Raises:
        gr.Error: If the model is not loaded, no reference recording was
            given, or the text contains nothing to synthesize.
    """
    if other_tts is None:
        raise gr.Error("The TTS model is not loaded (no CUDA device was available at startup).")
    if not audio_path:
        raise gr.Error("Please provide a reference voice recording.")

    texts = split_and_recombine_text(ins or "")
    if not texts:
        # Without this guard the peak normalization below fails on an empty array.
        raise gr.Error("Please enter some text to synthesize.")

    ref_s = other_tts.compute_style(audio_path)
    print(ref_s.size())
    s_prev = None

    segments = []
    for chunk in texts:
        phonemes = text_to_phonemes(chunk)
        synthaud, s_prev = other_tts.long_inference_segment(phonemes, diffusion_steps=steps,
                                                            alpha=alpha, beta=beta, is_phonemes=True,
                                                            embedding_scale=embedding, prev_s=s_prev, ref_s=ref_s,
                                                            speed=speed, t=0.7)

        # Silence the first and last 0.8% of every segment (presumably to hide
        # clicks where segments are joined - confirm). The guard matters:
        # with n_trim == 0, `synthaud[-0:]` would select the whole segment.
        n_trim = int(len(synthaud) * 0.008)
        if n_trim > 0:
            synthaud[:n_trim] = 0
            synthaud[-n_trim:] = 0
        segments.append(synthaud)

    # Join once instead of re-allocating per segment; float64 keeps the same
    # precision as accumulating onto an empty default-dtype array.
    audio = np.concatenate(segments).astype(np.float64)

    peak = np.max(np.abs(audio))
    if peak > 0:
        scaled = np.int16(audio / peak * 32767)
    else:
        # All-silent output: avoid dividing by zero.
        scaled = np.zeros(len(audio), dtype=np.int16)

    # 24000 is the sample rate reported alongside the samples
    # (presumably the model's output rate - confirm).
    return 24000, scaled


# Load the fine-tuned StyleTTS2 checkpoint only when CUDA is available;
# otherwise leave the model unset (None).
other_tts = (
    tts.StyleTTS2(model_checkpoint_path='./epoch_2nd_00012.pth', config_path="models/config_ft.yml")
    if torch.cuda.is_available()
    else None
)

# Build the UI: header, a left column with the text / reference voice inputs
# and synthesis sliders, and a right column with the button, output and examples.
with gr.Blocks(theme=theme, js=js_func) as clone:
    gr.HTML(INTRO)
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Text", info="What do you want Vokan to say? | Longform generation may produce artifacts in between sentences", interactive=True)
            voice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=1000,
                             waveform_options={'waveform_progress_color': '#FF593E'})
            steps = gr.Slider(minimum=3, maximum=200, value=20, step=1, label="Diffusion Steps",
                              info="Higher produces better results typically", interactive=True)
            embscale = gr.Slider(minimum=1, maximum=5, value=2, step=0.1, label="Embedding Scale",
                                 info="Defaults to 2 | high scales may produce unexpected results | Higher scales produce more emotion guided results", interactive=True)
            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3 | Lower = More similar in sound to speaker",
                              interactive=True)
            beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7 | Lower = More similar prosody at cost of stability",
                             interactive=True)
            speed = gr.Slider(minimum=0.5, maximum=1.5, value=1, step=0.1, label="Speed of speech",
                              info="Defaults to 1", interactive=True)
        with gr.Column(scale=1):
            clbtn = gr.Button("Synthesize", variant="primary")
            claudio = gr.Audio(interactive=False, label="Synthesized Audio",
                               waveform_options={'waveform_progress_color': '#FF593E'})
            # The input order must match generate()'s positional parameters:
            # (audio_path, ins, speed, alpha, beta, embedding, steps).
            clbtn.click(generate, inputs=[voice, inp, speed, alpha, beta, embscale, steps], outputs=[claudio],
                        concurrency_limit=15)

            # Example rows use the same input order as the click handler.
            gr.Examples(examples=examples,
                        inputs=[voice, inp, speed, alpha, beta, embscale, steps],
                        outputs=[claudio],
                        fn=generate,
                        cache_examples=True,)

if __name__ == "__main__":
    # Enable the request queue (max_size=15, api_open=False) and launch the
    # app with show_api=False.
    clone.queue(api_open=False, max_size=15).launch(show_api=False)