mrfakename
committed on
Commit
•
6621613
1
Parent(s):
a5cfbc2
Allow acronym expansion
Browse files- app.py +31 -13
- requirements.txt +3 -1
app.py
CHANGED
@@ -6,6 +6,20 @@ import os
|
|
6 |
# from tortoise.utils.text import split_and_recombine_text
|
7 |
import numpy as np
|
8 |
import pickle
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
theme = gr.themes.Base(
|
10 |
font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
|
11 |
)
|
@@ -20,12 +34,14 @@ global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_
|
|
20 |
# else:
|
21 |
for v in voicelist:
|
22 |
voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
|
23 |
-
def synthesize(text, voice, multispeakersteps):
|
24 |
if text.strip() == "":
|
25 |
raise gr.Error("You must enter some text")
|
26 |
# if len(global_phonemizer.phonemize([text])) > 300:
|
27 |
if len(text) > 300:
|
28 |
raise gr.Error("Text must be under 300 characters")
|
|
|
|
|
29 |
v = voice.lower()
|
30 |
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
|
31 |
return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
|
@@ -53,14 +69,14 @@ def clsynthesize(text, voice, vcsteps):
|
|
53 |
raise gr.Error("Text must be under 300 characters")
|
54 |
# return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=20, embedding_scale=1))
|
55 |
return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
|
56 |
-
def ljsynthesize(text):
|
57 |
if text.strip() == "":
|
58 |
raise gr.Error("You must enter some text")
|
59 |
# if global_phonemizer.phonemize([text]) > 300:
|
60 |
if len(text) > 300:
|
61 |
raise gr.Error("Text must be under 300 characters")
|
62 |
noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
|
63 |
-
return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=
|
64 |
|
65 |
|
66 |
with gr.Blocks() as vctk: # just realized it isn't vctk but libritts but i'm too lazy to change it rn
|
@@ -69,11 +85,12 @@ with gr.Blocks() as vctk: # just realized it isn't vctk but libritts but i'm too
|
|
69 |
inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
|
70 |
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
|
71 |
multispeakersteps = gr.Slider(minimum=5, maximum=15, value=7, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
|
|
|
72 |
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
|
73 |
with gr.Column(scale=1):
|
74 |
btn = gr.Button("Synthesize", variant="primary")
|
75 |
audio = gr.Audio(interactive=False, label="Synthesized Audio")
|
76 |
-
btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
|
77 |
with gr.Blocks() as clone:
|
78 |
with gr.Row():
|
79 |
with gr.Column(scale=1):
|
@@ -83,7 +100,16 @@ with gr.Blocks() as clone:
|
|
83 |
with gr.Column(scale=1):
|
84 |
clbtn = gr.Button("Synthesize", variant="primary")
|
85 |
claudio = gr.Audio(interactive=False, label="Synthesized Audio")
|
86 |
-
clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps], outputs=[claudio], concurrency_limit=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
# with gr.Blocks() as longText:
|
88 |
# with gr.Row():
|
89 |
# with gr.Column(scale=1):
|
@@ -95,14 +121,6 @@ with gr.Blocks() as clone:
|
|
95 |
# lngbtn = gr.Button("Synthesize", variant="primary")
|
96 |
# lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
|
97 |
# lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
|
98 |
-
with gr.Blocks() as lj:
|
99 |
-
with gr.Row():
|
100 |
-
with gr.Column(scale=1):
|
101 |
-
ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
|
102 |
-
with gr.Column(scale=1):
|
103 |
-
ljbtn = gr.Button("Synthesize", variant="primary")
|
104 |
-
ljaudio = gr.Audio(interactive=False, label="Synthesized Audio")
|
105 |
-
ljbtn.click(ljsynthesize, inputs=[ljinp], outputs=[ljaudio], concurrency_limit=4)
|
106 |
with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
|
107 |
gr.Markdown("""# StyleTTS 2
|
108 |
|
|
|
6 |
# from tortoise.utils.text import split_and_recombine_text
|
7 |
import numpy as np
|
8 |
import pickle
|
9 |
+
import spacy
|
10 |
+
|
11 |
+
from scispacy.abbreviation import AbbreviationDetector
|
12 |
+
nlp = spacy.load("en_core_sci_sm")
|
13 |
+
|
14 |
+
# Add the abbreviation pipe to the spacy pipeline.
|
15 |
+
nlp.add_pipe("abbreviation_detector")
|
16 |
+
def replace_acronyms(text):
|
17 |
+
doc = nlp(text)
|
18 |
+
altered_tok = [tok.text for tok in doc]
|
19 |
+
for abrv in doc._.abbreviations:
|
20 |
+
altered_tok[abrv.start] = str(abrv._.long_form)
|
21 |
+
|
22 |
+
return(" ".join(altered_tok))
|
23 |
theme = gr.themes.Base(
|
24 |
font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
|
25 |
)
|
|
|
34 |
# else:
|
35 |
for v in voicelist:
|
36 |
voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
|
37 |
+
def synthesize(text, voice, multispeakersteps, msexpand):
|
38 |
if text.strip() == "":
|
39 |
raise gr.Error("You must enter some text")
|
40 |
# if len(global_phonemizer.phonemize([text])) > 300:
|
41 |
if len(text) > 300:
|
42 |
raise gr.Error("Text must be under 300 characters")
|
43 |
+
if msexpand:
|
44 |
+
text = replace_acronyms(text)
|
45 |
v = voice.lower()
|
46 |
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
|
47 |
return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
|
|
|
69 |
raise gr.Error("Text must be under 300 characters")
|
70 |
# return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=20, embedding_scale=1))
|
71 |
return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
|
72 |
+
def ljsynthesize(text, steps):
|
73 |
if text.strip() == "":
|
74 |
raise gr.Error("You must enter some text")
|
75 |
# if global_phonemizer.phonemize([text]) > 300:
|
76 |
if len(text) > 300:
|
77 |
raise gr.Error("Text must be under 300 characters")
|
78 |
noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
|
79 |
+
return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=steps, embedding_scale=1))
|
80 |
|
81 |
|
82 |
with gr.Blocks() as vctk: # just realized it isn't vctk but libritts but i'm too lazy to change it rn
|
|
|
85 |
inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
|
86 |
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
|
87 |
multispeakersteps = gr.Slider(minimum=5, maximum=15, value=7, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
|
88 |
+
msexpand = gr.Checkbox(label="Expand acronyms", info="Expand acronyms using SciSpacy algorithm")
|
89 |
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
|
90 |
with gr.Column(scale=1):
|
91 |
btn = gr.Button("Synthesize", variant="primary")
|
92 |
audio = gr.Audio(interactive=False, label="Synthesized Audio")
|
93 |
+
btn.click(synthesize, inputs=[inp, voice, multispeakersteps, msexpand], outputs=[audio], concurrency_limit=4)
|
94 |
with gr.Blocks() as clone:
|
95 |
with gr.Row():
|
96 |
with gr.Column(scale=1):
|
|
|
100 |
with gr.Column(scale=1):
|
101 |
clbtn = gr.Button("Synthesize", variant="primary")
|
102 |
claudio = gr.Audio(interactive=False, label="Synthesized Audio")
|
103 |
+
clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps], outputs=[claudio], concurrency_limit=2)
|
104 |
+
with gr.Blocks() as lj:
|
105 |
+
with gr.Row():
|
106 |
+
with gr.Column(scale=1):
|
107 |
+
ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
|
108 |
+
with gr.Column(scale=1):
|
109 |
+
ljbtn = gr.Button("Synthesize", variant="primary")
|
110 |
+
ljaudio = gr.Audio(interactive=False, label="Synthesized Audio")
|
111 |
+
ljsteps = gr.Slider(minimum=5, maximum=15, value=7, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
|
112 |
+
ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)
|
113 |
# with gr.Blocks() as longText:
|
114 |
# with gr.Row():
|
115 |
# with gr.Column(scale=1):
|
|
|
121 |
# lngbtn = gr.Button("Synthesize", variant="primary")
|
122 |
# lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
|
123 |
# lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
|
125 |
gr.Markdown("""# StyleTTS 2
|
126 |
|
requirements.txt
CHANGED
@@ -20,4 +20,6 @@ phonemizer
|
|
20 |
cached-path
|
21 |
gradio
|
22 |
gruut
|
23 |
-
# tortoise-tts
|
|
|
|
|
|
20 |
cached-path
|
21 |
gradio
|
22 |
gruut
|
23 |
+
# tortoise-tts
|
24 |
+
spacy
|
25 |
+
scispacy
|