File size: 4,619 Bytes
1373f78 f5915fd 635f007 d430de8 fde7bda 85fcd3c 73dbaa9 1373f78 635f007 1373f78 635f007 1373f78 e2e4977 1373f78 6eb9ea3 1373f78 fde7bda 1373f78 e27b102 138b27f fbe2075 85fcd3c e27b102 1373f78 6440f80 d430de8 635f007 04c285e d430de8 1373f78 addff22 d430de8 1373f78 50a9d0f 1373f78 635f007 304fce9 1373f78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import os
import gradio as gr
import torch
from styletts2importable import compute_style, device, inference
from txtsplit import txtsplit
import numpy as np
import phonemizer
# Shared UI theme: Google-hosted Libre Franklin / Public Sans with generic fallbacks.
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont(f) for f in ("Libre Franklin", "Public Sans")]
    + ["system-ui", "sans-serif"],
)
# Preset reference voices bundled with the app: four female ("f") and four
# male ("m") US-English speakers, named e.g. "f-us-1" .. "m-us-4".
voicelist = [f"{sex}-us-{idx}" for sex in ("f", "m") for idx in range(1, 5)]
# Cache of per-voice style embeddings, keyed by voice name; filled by the
# loading loop below.
voices = {}
# eSpeak-backed phonemizer for US English, keeping punctuation and stress
# markers. NOTE(review): not referenced elsewhere in this file — presumably
# consumed by the imported styletts2 helpers; confirm before removing.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language="en-us", preserve_punctuation=True, with_stress=True
)
# Load a style embedding for every preset voice, caching the tensor on disk
# (as <voice>.wav.npy) so later launches skip the expensive compute_style call.
for name in voicelist:
    wav_path = f"voices/{name}.wav"
    npy_path = f"{wav_path}.npy"
    if os.path.exists(npy_path):
        # Cache hit: restore the saved array and move it to the active device.
        voices[name] = torch.from_numpy(np.load(npy_path)).to(device)
    else:
        # Cache miss: compute the embedding, keep it in memory, persist a copy.
        embedding = compute_style(wav_path)
        voices[name] = embedding
        np.save(npy_path, embedding.cpu().numpy())
def synthesize(text, voice, lngsteps):
    """Synthesize `text` with the chosen preset voice.

    Splits the text into chunks, runs inference on each with the voice's
    cached style embedding, and returns a `(sample_rate, waveform)` tuple at
    24 kHz for the gr.Audio component.

    Raises gr.Error for blank input or input longer than 50k characters.
    """
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    chunks = txtsplit(text)
    # Voice dropdown values are already lowercase; lower() is a cheap guard.
    style = voices[voice.lower()]
    rendered = [
        inference(
            chunk,
            style,
            alpha=0.3,
            beta=0.7,
            diffusion_steps=lngsteps,
            embedding_scale=1,
        )
        for chunk in chunks
    ]
    return (24000, np.concatenate(rendered))
# Multi-voice synthesis tab: text box + voice picker + step slider on the
# left, synthesize button and audio player on the right.
with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text",
                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
                interactive=True,
            )
            voice_dropdown = gr.Dropdown(
                voicelist,
                label="Voice",
                info="Select a default voice.",
                value="m-us-2",
                interactive=True,
            )
            steps_slider = gr.Slider(
                minimum=3,
                maximum=15,
                value=3,
                step=1,
                label="Diffusion Steps",
                info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
                interactive=True,
            )
        with gr.Column(scale=1):
            synth_button = gr.Button("Synthesize", variant="primary")
            audio_output = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={"waveform_progress_color": "#3C82F6"},
            )
    synth_button.click(
        synthesize,
        inputs=[text_input, voice_dropdown, steps_slider],
        outputs=[audio_output],
        concurrency_limit=4,
    )
# Top-level app shell: one tabbed page hosting the multi-voice UI.
with gr.Blocks(
    title="StyleTTS 2", css="footer{display:none !important}", theme=theme
) as demo:
    # The tab-name list previously carried stale entries ("Voice Cloning",
    # "LJSpeech", "Long Text [Beta]") for interfaces that are not passed in.
    # Gradio pairs names to interfaces positionally, so the extras were
    # ignored — but the mismatch was misleading. One interface, one name.
    gr.TabbedInterface([vctk], ["Multi-Voice"])
if __name__ == "__main__":
    # Removed a large block of commented-out warm-up/benchmark calls that was
    # dead code cluttering the entry point.
    print("Launching")
    # NOTE(review): api_open=True exposes the queue API to programmatic
    # clients even though show_api=False hides the API docs page — kept as-is
    # to preserve existing behavior; confirm this combination is intentional.
    demo.queue(api_open=True, max_size=None).launch(show_api=False)
    print("Launched")
|