# File size: 7,668 Bytes
# d10c5e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import gradio as gr
import spaces
from styletts2 import tts
import re
import numpy as np
from scipy.io.wavfile import write
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

import torch

import phonemizer  # en-us

# HTML/CSS banner shown at the top of the page (passed to gr.HTML): a rounded,
# hover-scaling logo container followed by a one-line tagline.
# The tagline's attributes are separated by whitespace only; a comma between
# HTML attributes is not valid attribute syntax.
INTRO = """
<style>

  .TitleContainer {
    background-color: #ffff;
    margin-bottom: 0rem;
    margin-left: auto;
    margin-right: auto;
    width: 40%;
    height: 30%;
    border-radius: 10rem;
    border: 0.5vw solid #ff593e;
    text-align: center;
    display: flex;
    justify-content: center;
    transition: .6s;
  }

  .TitleContainer:hover {
    transform: scale(1.05);
  }

  .VokanLogo {
    margin: auto;
    display: block;
  }

</style>

<div class="TitleContainer">
      <img src="https://huggingface.co/spaces/ShoukanLabs/Vokan/resolve/main/Vokan.gif" class="VokanLogo">
</div>

<p align="center" style="font-size: 1vw; font-weight: bold; color: #ff593e;">A StyleTTS2 fine-tune, designed for expressiveness.</p>

<hr>
"""



# JavaScript snippet handed to gr.Blocks(js=...). When the page URL does not
# already carry `__theme=light`, it sets that query parameter and navigates to
# the updated URL.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""

# Visual theme for the app: the Soft base theme with a custom orange-red
# primary palette, custom corner radii and the "M PLUS Rounded 1c" web font.
theme = gr.themes.Soft(
    primary_hue=gr.themes.Color(
        c50="#fff0f0",
        c100="#ffd7d1",
        c200="#ff593e",
        c300="#ff593e",
        c400="#ff593e",
        c500="#ff593e",
        c600="#ea580c",
        c700="#c2410c",
        c800="#9a3412",
        c900="#7c2d12",
        c950="#6c2e12",
    ),
    secondary_hue="orange",
    radius_size=gr.themes.Size(
        xxs="2px",
        xs="4px",
        sm="6px",
        md="8px",
        lg="20px",
        xl="30px",
        xxl="40px",
    ),
    font=[
        gr.themes.GoogleFont('M PLUS Rounded 1c'),
        'ui-sans-serif',
        'system-ui',
        'sans-serif',
    ],
).set(
    block_background_fill='*neutral_50',
)

# Shared espeak backend for US English, created once at import time and reused
# by text_to_phonemes. Configured with preserve_punctuation and with_stress
# enabled, language-switch flags removed, and no tie characters.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us',
    preserve_punctuation=True,
    with_stress=True,
    language_switch="remove-flags",
    tie=False,
)


def split_and_recombine_text(text, desired_length=200, max_length=300):
    """Split text into chunks of a desired length, trying to keep sentences intact.

    The text is whitespace-normalized and then scanned one character at a
    time. Sentence boundaries found outside double quotes (and the end of a
    quoted passage) are remembered as candidate split points. A chunk is
    emitted when a boundary is reached and the chunk is at least
    ``desired_length`` characters long, or forcibly once it reaches
    ``max_length`` characters.

    Args:
        text: Input text to split.
        desired_length: Chunk length (in characters) at which a chunk is
            committed as soon as a sentence boundary is reached.
        max_length: Chunk length (in characters) at which a split is forced.

    Returns:
        A list of stripped, non-empty chunks. Chunks made up only of
        whitespace and the punctuation ``. , ; : ! ?`` are dropped.
    """
    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
    # NOTE: the second substitution turns every whitespace run (newlines
    # included) into a single space, so no '\n' is left for the checks below.
    text = re.sub(r'\n\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[“”]', '"', text)

    rv = []            # finished chunks
    in_quote = False   # toggled each time the cursor lands on a '"'
    current = ""       # chunk being built
    split_pos = []     # indices into `text` of boundaries seen in the current chunk
    pos = -1           # cursor index into `text`; -1 means nothing consumed yet
    end_pos = len(text) - 1

    def seek(delta):
        # Move the cursor |delta| characters forward (delta > 0) or backward
        # (delta < 0), appending to or trimming `current` accordingly, and
        # return the character at the new cursor position.
        nonlocal pos, in_quote, current
        is_neg = delta < 0
        for _ in range(abs(delta)):
            if is_neg:
                pos -= 1
                current = current[:-1]
            else:
                pos += 1
                current += text[pos]
            # the quote state flips based on the character at the NEW position,
            # in both directions
            if text[pos] == '"':
                in_quote = not in_quote
        return text[pos]

    def peek(delta):
        # Look at the character `delta` positions from the cursor without moving.
        # NOTE: the bound is `p < end_pos`, so the final character of `text`
        # also reads as "". Since "" is contained in every string, the
        # `peek(...) in '...'` tests in the main loop succeed in that case.
        p = pos + delta
        return text[p] if p < end_pos and p >= 0 else ""

    def commit():
        # Emit the chunk built so far and reset the per-chunk state.
        nonlocal rv, current, split_pos
        rv.append(current)
        current = ""
        split_pos = []

    while pos < end_pos:
        c = seek(1)
        # do we need to force a split?
        if len(current) >= max_length:
            if len(split_pos) > 0 and len(current) > (desired_length / 2):
                # we have at least one sentence and we are over half the desired length, seek back to the last split
                d = pos - split_pos[-1]
                seek(-d)
            else:
                # no full sentences, seek back until we are not in the middle of a word and split there
                while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
                    c = seek(-1)
            commit()
        # check for sentence boundaries
        elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in '\n ')):
            # seek forward if we have consecutive boundary markers but still within the max length
            while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
                c = seek(1)
            split_pos.append(pos)
            if len(current) >= desired_length:
                commit()
        # treat end of quote as a boundary if it is followed by a space or newline
        elif in_quote and peek(1) == '"' and peek(2) in '\n ':
            seek(2)
            split_pos.append(pos)
    # whatever is left after the scan becomes the last chunk
    rv.append(current)

    # clean up, remove lines with only whitespace or punctuation
    rv = [s.strip() for s in rv]
    rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]

    return rv


def text_to_phonemes(text):
    """Phonemize ``text`` and return the result as space-separated tokens.

    The stripped input is passed to the module-level espeak backend; the first
    phonemized string is split with NLTK's ``word_tokenize`` and re-joined
    with single spaces. Each stage is printed for debugging.
    """
    cleaned = text.strip()
    print("Text before phonemization: ", cleaned)

    phonemized = global_phonemizer.phonemize([cleaned])
    print("Text after phonemization: ", phonemized)

    joined = ' '.join(word_tokenize(phonemized[0]))
    print("Final text after tokenization: ", joined)
    return joined


@spaces.GPU
def generate(audio_path, ins, speed, alpha, beta, embedding, steps=100):
    """Synthesize ``ins`` using the style of the reference clip at ``audio_path``.

    The text is split into chunks, each chunk is phonemized and synthesized in
    turn (the style returned for one chunk is fed into the next), and the
    audio of ALL chunks is concatenated and peak-normalized to 16-bit PCM.

    Args:
        audio_path: Path of the reference audio file used to compute the style.
        ins: Text to speak.
        speed: Forwarded to the model as ``speed``.
        alpha: Forwarded to the model as ``alpha``.
        beta: Forwarded to the model as ``beta``.
        embedding: Forwarded to the model as ``embedding_scale``.
        steps: Forwarded to the model as ``diffusion_steps``.

    Returns:
        ``(24000, samples)`` where ``samples`` is an ``int16`` numpy array, or
        ``None`` when the text yields no chunks or no audio was produced.

    Raises:
        RuntimeError: If the model was not loaded at startup.
    """
    if other_tts is None:
        # The model is only loaded when CUDA was available at import time.
        raise RuntimeError(
            "StyleTTS2 model is not loaded (no CUDA device was available at startup)."
        )

    texts = split_and_recombine_text(ins)
    if not texts:
        # Nothing to speak; skip the style computation entirely.
        return None

    ref_s = other_tts.compute_style(audio_path)
    print(ref_s.size())
    s_prev = None

    segments = []
    for chunk in texts:
        phonemes = text_to_phonemes(chunk)
        synthaud, s_prev = other_tts.long_inference_segment(phonemes, diffusion_steps=steps,
                                                            alpha=alpha, beta=beta, is_phonemes=True,
                                                            embedding_scale=embedding, prev_s=s_prev, ref_s=ref_s,
                                                            speed=speed, t=0.7)
        segments.append(synthaud)

    # Join once, after the loop, so every chunk ends up in the output. The
    # float64 cast keeps the arithmetic identical to accumulating onto an
    # empty float64 array.
    audio = np.concatenate(segments).astype(np.float64)
    if audio.size == 0:
        return None

    # Peak-normalize to the int16 range; all-zero audio is left as zeros
    # instead of dividing by zero.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 32767
    scaled = np.int16(audio)

    return 24000, scaled

# Load the fine-tuned StyleTTS2 checkpoint only when CUDA is available;
# otherwise the handle stays None.
other_tts = (
    tts.StyleTTS2(model_checkpoint_path='./epoch_2nd_00012.pth', config_path="models/config_ft.yml")
    if torch.cuda.is_available()
    else None
)

# Two-column layout: text, reference voice and synthesis sliders on the left;
# the trigger button and the synthesized audio on the right.
with gr.Blocks(theme=theme, js=js_func) as clone:
    gr.HTML(INTRO)
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(
                label="Text",
                info="What do you want Vokan to say?",
                interactive=True,
            )
            voice = gr.Audio(
                label="Voice",
                interactive=True,
                type='filepath',
                max_length=300,
                waveform_options={'waveform_progress_color': '#FF593E'},
            )
            steps = gr.Slider(
                minimum=3,
                maximum=60,
                value=20,
                step=1,
                label="Diffusion Steps",
                info="Higher produces better results typically",
                interactive=True,
            )
            embscale = gr.Slider(
                minimum=1,
                maximum=10,
                value=2,
                step=0.1,
                label="Embedding Scale",
                info="Defaults to 2 | low scales may produce unexpected results",
                interactive=True,
            )
            alpha = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.3,
                step=0.1,
                label="Alpha",
                info="Defaults to 0.3",
                interactive=True,
            )
            beta = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.7,
                step=0.1,
                label="Beta",
                info="Defaults to 0.7",
                interactive=True,
            )
            speed = gr.Slider(
                minimum=0.5,
                maximum=1.5,
                value=1,
                step=0.1,
                label="Speed of speech",
                info="Defaults to 1",
                interactive=True,
            )
        with gr.Column(scale=1):
            clbtn = gr.Button("Synthesize", variant="primary")
            claudio = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={'waveform_progress_color': '#FF593E'},
            )
            # The input list follows generate's positional parameter order:
            # audio_path, ins, speed, alpha, beta, embedding, steps.
            clbtn.click(
                generate,
                inputs=[voice, inp, speed, alpha, beta, embscale, steps],
                outputs=[claudio],
                concurrency_limit=4,
            )

if __name__ == "__main__":
    # Enable the request queue (at most 15 pending requests, API closed),
    # then start the server with the API page hidden.
    queued_app = clone.queue(api_open=False, max_size=15)
    queued_app.launch(show_api=False)