Spaces:

ShoukanLabs
/

Vokan

Running on Zero

App Files Files Community

Korakoe committed on Mar 19

Commit

d10c5e3

•

1 Parent(s): 0cad0bd

Create app.py

Browse files

Files changed (1) hide show

app.py +212 -0

app.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import gradio as gr
+import spaces
+from styletts2 import tts
+import re
+import numpy as np
+from scipy.io.wavfile import write
+import pyaudio
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import word_tokenize
+import torch
+import phonemizer  # en-us
# HTML/CSS banner rendered at the top of the page via gr.HTML(INTRO): an
# orange-bordered pill containing the animated logo, a one-line tagline and a
# horizontal rule.
# NOTE: attributes inside a tag are separated by whitespace only; the comma
# that used to follow align="center" was invalid markup and has been removed.
INTRO = """
<style>
  .TitleContainer {
    background-color: #ffff;
    margin-bottom: 0rem;
    margin-left: auto;
    margin-right: auto;
    width: 40%;
    height: 30%;
    border-radius: 10rem;
    border: 0.5vw solid #ff593e;
    text-align: center;
    display: flex;
    justify-content: center;
    transition: .6s;
  }
  .TitleContainer:hover {
    transform: scale(1.05);
  }
  .VokanLogo {
    margin: auto;
    display: block;
  }
</style>
<div class="TitleContainer">
      <img src="https://huggingface.co/spaces/ShoukanLabs/Vokan/resolve/main/Vokan.gif" class="VokanLogo">
</div>
<p align="center" style="font-size: 1vw; font-weight: bold; color: #ff593e;">A StyleTTS2 fine-tune, designed for expressiveness.</p>
<hr>
"""
# JavaScript handed to gr.Blocks(js=...): if the page URL does not already
# carry ?__theme=light, add it and reload so the UI is shown in light mode.
js_func = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""

# Orange-accented variant of Gradio's Soft theme used by the whole app.
_primary_palette = gr.themes.Color(
    c50="#fff0f0",
    c100="#ffd7d1",
    c200="#ff593e",
    c300="#ff593e",
    c400="#ff593e",
    c500="#ff593e",
    c600="#ea580c",
    c700="#c2410c",
    c800="#9a3412",
    c900="#7c2d12",
    c950="#6c2e12",
)
_corner_radii = gr.themes.Size(
    xxs="2px",
    xs="4px",
    sm="6px",
    md="8px",
    lg="20px",
    xl="30px",
    xxl="40px",
)
theme = gr.themes.Soft(
    primary_hue=_primary_palette,
    secondary_hue="orange",
    radius_size=_corner_radii,
    font=[
        gr.themes.GoogleFont('M PLUS Rounded 1c'),
        'ui-sans-serif',
        'system-ui',
        'sans-serif',
    ],
).set(block_background_fill='*neutral_50')

# Shared espeak phonemizer for US English; punctuation and stress marks are
# kept in the output, language-switch flags are removed.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us',
    preserve_punctuation=True,
    with_stress=True,
    language_switch="remove-flags",
    tie=False,
)
def split_and_recombine_text(text, desired_length=200, max_length=300):
    """Split text into chunks of a desired length, trying to keep sentences intact.

    The text is first normalized (whitespace runs collapsed to one space,
    curly double quotes converted to ASCII quotes) and then scanned one
    character at a time. Sentence boundaries outside of double quotes are
    remembered; a chunk is emitted once it reaches ``desired_length`` at a
    boundary, or forcibly when it reaches ``max_length``.

    Args:
        text: The input text.
        desired_length: Length at which a chunk is emitted as soon as a
            sentence boundary is found.
        max_length: Length at which a split is forced, preferably at the last
            remembered boundary, otherwise by backing up out of the current word.

    Returns:
        A list of stripped, non-empty chunks. Chunks consisting solely of
        whitespace/punctuation are dropped.
    """
    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
    text = re.sub(r'\n\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[“”]', '"', text)

    rv = []
    in_quote = False
    current = ""
    split_pos = []
    pos = -1
    end_pos = len(text) - 1

    # What may follow a '.' or a closing quote for it to count as a boundary:
    # whitespace, or nothing at all (end of text). Spelled out as a tuple
    # because `"" in "some string"` is always True, which hides mistakes.
    end_or_space = ('', '\n', ' ')

    def seek(delta):
        """Move the cursor by ``delta`` characters, keeping ``current`` and ``in_quote`` in sync."""
        nonlocal pos, in_quote, current
        is_neg = delta < 0
        for _ in range(abs(delta)):
            if is_neg:
                pos -= 1
                current = current[:-1]
            else:
                pos += 1
                current += text[pos]
            if text[pos] == '"':
                in_quote = not in_quote
        return text[pos]

    def peek(delta):
        """Return the character ``delta`` positions away, or "" when outside the text."""
        p = pos + delta
        # The last character (p == end_pos) is a valid lookahead target; the
        # previous `p < end_pos` bound made it invisible.
        return text[p] if 0 <= p <= end_pos else ""

    def commit():
        """Emit the chunk built so far and forget the boundaries inside it."""
        nonlocal rv, current, split_pos
        rv.append(current)
        current = ""
        split_pos = []

    while pos < end_pos:
        c = seek(1)
        # do we need to force a split?
        if len(current) >= max_length:
            if len(split_pos) > 0 and len(current) > (desired_length / 2):
                # we have at least one sentence and we are over half the desired length, seek back to the last split
                d = pos - split_pos[-1]
                seek(-d)
            else:
                # no full sentences, seek back until we are not in the middle of a word and split there
                while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
                    c = seek(-1)
            commit()
        # check for sentence boundaries
        elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in end_or_space)):
            # seek forward if we have consecutive boundary markers but still within the max length
            while pos < end_pos and len(current) < max_length and peek(1) in ('!', '?', '.'):
                c = seek(1)
            split_pos.append(pos)
            if len(current) >= desired_length:
                commit()
        # treat end of quote as a boundary if its followed by a space or newline
        elif in_quote and peek(1) == '"' and peek(2) in end_or_space:
            # consume the closing quote and, if present, the whitespace after
            # it; never step past the end of the text
            seek(min(2, end_pos - pos))
            split_pos.append(pos)
    rv.append(current)

    # clean up, remove lines with only whitespace or punctuation
    rv = [s.strip() for s in rv]
    rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]
    return rv
def text_to_phonemes(text):
    """Turn a piece of text into a single space-separated phoneme string.

    The stripped text is phonemized with the module-level ``global_phonemizer``,
    the first result is tokenized with NLTK's ``word_tokenize`` and the tokens
    are re-joined with single spaces. Each stage is printed for debugging.
    """
    stripped = text.strip()
    print("Text before phonemization: ", stripped)

    phonemized = global_phonemizer.phonemize([stripped])
    print("Text after phonemization: ", phonemized)

    tokens = word_tokenize(phonemized[0])
    joined = ' '.join(tokens)
    print("Final text after tokenization: ", joined)
    return joined
@spaces.GPU
def generate(audio_path, ins, speed, alpha, beta, embedding, steps=100):
    """Synthesize ``ins`` in the voice of the reference clip at ``audio_path``.

    The text is split into chunks with ``split_and_recombine_text``; each chunk
    is phonemized and synthesized with ``other_tts.long_inference_segment``,
    passing the previous chunk's returned state (``prev_s``) into the next
    call. All chunks are concatenated and peak-normalized to 16-bit PCM.

    Args:
        audio_path: File path of the reference voice recording.
        ins: Text to speak.
        speed: Forwarded as ``speed``.
        alpha: Forwarded as ``alpha``.
        beta: Forwarded as ``beta``.
        embedding: Forwarded as ``embedding_scale``.
        steps: Forwarded as ``diffusion_steps``.

    Returns:
        ``(24000, samples)`` where ``samples`` is an ``int16`` NumPy array, or
        ``None`` when the text yields nothing to synthesize.

    Raises:
        gr.Error: If the model was not loaded or no reference clip was given.
    """
    if other_tts is None:
        # The model is only instantiated at import time when CUDA is available.
        raise gr.Error("The StyleTTS2 model is not loaded (no CUDA device available).")
    if not audio_path:
        raise gr.Error("Please provide a reference voice recording.")

    ref_s = other_tts.compute_style(audio_path)
    print(ref_s.size())

    s_prev = None
    segments = []
    for chunk in split_and_recombine_text(ins):
        phonemes = text_to_phonemes(chunk)
        # NOTE(review): t=0.7 is passed through unchanged; its meaning is
        # defined by StyleTTS2.long_inference_segment - confirm there.
        synthaud, s_prev = other_tts.long_inference_segment(phonemes, diffusion_steps=steps,
                                                            alpha=alpha, beta=beta, is_phonemes=True,
                                                            embedding_scale=embedding, prev_s=s_prev, ref_s=ref_s,
                                                            speed=speed, t=0.7)
        segments.append(synthaud)

    # Return only after *all* chunks were synthesized (returning from inside
    # the loop cut the output off after the first chunk).
    if not segments:
        return None

    audio = np.concatenate(segments)
    # Peak-normalize once over the full signal; skip the division for empty or
    # all-zero audio, which would otherwise produce NaNs.
    peak = np.max(np.abs(audio)) if audio.size else 0
    if peak > 0:
        audio = audio / peak
    scaled = np.int16(audio * 32767)
    return 24000, scaled
# Instantiate the fine-tuned StyleTTS2 model only when torch reports a CUDA
# device; otherwise the handle stays None.
other_tts = (
    tts.StyleTTS2(model_checkpoint_path='./epoch_2nd_00012.pth', config_path="models/config_ft.yml")
    if torch.cuda.is_available()
    else None
)
# Page layout: banner on top, inputs (text, reference voice, synthesis
# sliders) in the left column, the trigger button and result player on the right.
with gr.Blocks(theme=theme, js=js_func) as clone:
    gr.HTML(INTRO)
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(
                label="Text",
                info="What do you want Vokan to say?",
                interactive=True,
            )
            voice = gr.Audio(
                label="Voice",
                interactive=True,
                type='filepath',
                max_length=300,
                waveform_options={'waveform_progress_color': '#FF593E'},
            )
            steps = gr.Slider(
                minimum=3, maximum=60, value=20, step=1,
                label="Diffusion Steps",
                info="Higher produces better results typically",
                interactive=True,
            )
            embscale = gr.Slider(
                minimum=1, maximum=10, value=2, step=0.1,
                label="Embedding Scale",
                info="Defaults to 2 | low scales may produce unexpected results",
                interactive=True,
            )
            alpha = gr.Slider(
                minimum=0, maximum=1, value=0.3, step=0.1,
                label="Alpha",
                info="Defaults to 0.3",
                interactive=True,
            )
            beta = gr.Slider(
                minimum=0, maximum=1, value=0.7, step=0.1,
                label="Beta",
                info="Defaults to 0.7",
                interactive=True,
            )
            speed = gr.Slider(
                minimum=0.5, maximum=1.5, value=1, step=0.1,
                label="Speed of speech",
                info="Defaults to 1",
                interactive=True,
            )
        with gr.Column(scale=1):
            clbtn = gr.Button("Synthesize", variant="primary")
            claudio = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={'waveform_progress_color': '#FF593E'},
            )
            # Input order must match generate()'s positional parameters.
            clbtn.click(
                generate,
                inputs=[voice, inp, speed, alpha, beta, embscale, steps],
                outputs=[claudio],
                concurrency_limit=4,
            )

if __name__ == "__main__":
    # Queue with at most 15 pending requests; the API is neither opened nor shown.
    clone.queue(api_open=False, max_size=15).launch(show_api=False)