File size: 4,619 Bytes
1373f78
f5915fd
635f007
d430de8
fde7bda
85fcd3c
73dbaa9
1373f78
 
 
635f007
1373f78
 
 
 
 
 
635f007
1373f78
 
 
 
 
 
 
 
 
 
e2e4977
1373f78
 
 
 
6eb9ea3
 
1373f78
 
fde7bda
1373f78
 
 
 
 
 
 
e27b102
 
138b27f
 
fbe2075
 
 
85fcd3c
e27b102
 
1373f78
 
 
 
 
 
 
 
 
 
 
6440f80
d430de8
635f007
04c285e
d430de8
 
1373f78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
addff22
d430de8
 
1373f78
 
 
 
 
 
 
 
 
 
 
50a9d0f
1373f78
 
 
 
 
 
635f007
304fce9
1373f78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os

import gradio as gr
import torch
from styletts2importable import compute_style, device, inference
from txtsplit import txtsplit
import numpy as np
import phonemizer


# UI theme: prefer hosted Google webfonts, falling back to system fonts.
theme = gr.themes.Base(
    font=[
        gr.themes.GoogleFont("Libre Franklin"),
        gr.themes.GoogleFont("Public Sans"),
        "system-ui",
        "sans-serif",
    ],
)
# Bundled reference speakers: four female then four male US voices.
voicelist = [f"{sex}-us-{i}" for sex in ("f", "m") for i in range(1, 5)]
voices = {}

# Espeak-based phonemizer shared by the synthesis pipeline.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language="en-us", preserve_punctuation=True, with_stress=True
)
# Load each voice's style embedding, computing and caching it on first run
# so later startups skip the expensive compute_style() call.
for name in voicelist:
    wav_path = f"voices/{name}.wav"
    npy_path = wav_path + ".npy"
    if os.path.exists(npy_path):
        voices[name] = torch.from_numpy(np.load(npy_path)).to(device)
    else:
        embedding = compute_style(wav_path)
        voices[name] = embedding
        np.save(npy_path, embedding.cpu().numpy())


def synthesize(text, voice, lngsteps):
    """Synthesize `text` with one of the preset voices.

    Args:
        text: Text to read aloud; long input is split into chunks first.
        voice: Voice name from `voicelist` (matched case-insensitively).
        lngsteps: Diffusion steps per chunk (more = slower, maybe better).

    Returns:
        Tuple of (sample_rate, samples): 24 kHz audio as a numpy array.

    Raises:
        gr.Error: if text is empty, longer than 50k characters, or the
            voice name is unknown.
    """
    if not text.strip():
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    v = voice.lower()
    # Surface a user-facing error instead of a raw KeyError on a bad voice.
    if v not in voices:
        raise gr.Error(f"Unknown voice: {voice}")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    # Split long input into model-sized chunks and synthesize each.
    texts = txtsplit(text)
    audios = [
        inference(
            t,
            voices[v],
            alpha=0.3,
            beta=0.7,
            diffusion_steps=lngsteps,
            embedding_scale=1,
        )
        for t in texts
    ]
    return (24000, np.concatenate(audios))


# Multi-voice tab: text + voice + steps on the left, synthesis on the right.
with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            # Input controls.
            text_box = gr.Textbox(
                label="Text",
                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
                interactive=True,
            )
            voice_dd = gr.Dropdown(
                voicelist,
                label="Voice",
                info="Select a default voice.",
                value="m-us-2",
                interactive=True,
            )
            steps_slider = gr.Slider(
                minimum=3,
                maximum=15,
                value=3,
                step=1,
                label="Diffusion Steps",
                info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
                interactive=True,
            )
            # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
        with gr.Column(scale=1):
            # Trigger button and audio output.
            synth_btn = gr.Button("Synthesize", variant="primary")
            audio_out = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={"waveform_progress_color": "#3C82F6"},
            )
            synth_btn.click(
                synthesize,
                inputs=[text_box, voice_dd, steps_slider],
                outputs=[audio_out],
                concurrency_limit=4,
            )

# Top-level app shell. TabbedInterface requires tab names to correspond
# one-to-one with the interface list; the three extra names for tabs that
# are not actually included ("Voice Cloning", "LJSpeech", "Long Text [Beta]")
# were dropped to keep the lists consistent.
with gr.Blocks(
    title="StyleTTS 2", css="footer{display:none !important}", theme=theme
) as demo:
    gr.TabbedInterface([vctk], ["Multi-Voice"])
if __name__ == "__main__":
    # Removed a large block of commented-out benchmark/launch code that
    # obscured the entry point.
    print("Launching")
    # api_open=True allows queue API access; show_api=False hides the docs page.
    demo.queue(api_open=True, max_size=None).launch(show_api=False)
    print("Launched")