mrfakename committed on
Commit
6621613
1 Parent(s): a5cfbc2

Allow acronym expansion

Browse files
Files changed (2) hide show
  1. app.py +31 -13
  2. requirements.txt +3 -1
app.py CHANGED
@@ -6,6 +6,20 @@ import os
6
  # from tortoise.utils.text import split_and_recombine_text
7
  import numpy as np
8
  import pickle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  theme = gr.themes.Base(
10
  font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
11
  )
@@ -20,12 +34,14 @@ global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_
20
  # else:
21
  for v in voicelist:
22
  voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
23
- def synthesize(text, voice, multispeakersteps):
24
  if text.strip() == "":
25
  raise gr.Error("You must enter some text")
26
  # if len(global_phonemizer.phonemize([text])) > 300:
27
  if len(text) > 300:
28
  raise gr.Error("Text must be under 300 characters")
 
 
29
  v = voice.lower()
30
  # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
31
  return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
@@ -53,14 +69,14 @@ def clsynthesize(text, voice, vcsteps):
53
  raise gr.Error("Text must be under 300 characters")
54
  # return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=20, embedding_scale=1))
55
  return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
56
- def ljsynthesize(text):
57
  if text.strip() == "":
58
  raise gr.Error("You must enter some text")
59
  # if global_phonemizer.phonemize([text]) > 300:
60
  if len(text) > 300:
61
  raise gr.Error("Text must be under 300 characters")
62
  noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
63
- return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=7, embedding_scale=1))
64
 
65
 
66
  with gr.Blocks() as vctk: # just realized it isn't vctk but libritts but i'm too lazy to change it rn
@@ -69,11 +85,12 @@ with gr.Blocks() as vctk: # just realized it isn't vctk but libritts but i'm too
69
  inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
70
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
71
  multispeakersteps = gr.Slider(minimum=5, maximum=15, value=7, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
 
72
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
73
  with gr.Column(scale=1):
74
  btn = gr.Button("Synthesize", variant="primary")
75
  audio = gr.Audio(interactive=False, label="Synthesized Audio")
76
- btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
77
  with gr.Blocks() as clone:
78
  with gr.Row():
79
  with gr.Column(scale=1):
@@ -83,7 +100,16 @@ with gr.Blocks() as clone:
83
  with gr.Column(scale=1):
84
  clbtn = gr.Button("Synthesize", variant="primary")
85
  claudio = gr.Audio(interactive=False, label="Synthesized Audio")
86
- clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps], outputs=[claudio], concurrency_limit=4)
 
 
 
 
 
 
 
 
 
87
  # with gr.Blocks() as longText:
88
  # with gr.Row():
89
  # with gr.Column(scale=1):
@@ -95,14 +121,6 @@ with gr.Blocks() as clone:
95
  # lngbtn = gr.Button("Synthesize", variant="primary")
96
  # lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
97
  # lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
98
- with gr.Blocks() as lj:
99
- with gr.Row():
100
- with gr.Column(scale=1):
101
- ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
102
- with gr.Column(scale=1):
103
- ljbtn = gr.Button("Synthesize", variant="primary")
104
- ljaudio = gr.Audio(interactive=False, label="Synthesized Audio")
105
- ljbtn.click(ljsynthesize, inputs=[ljinp], outputs=[ljaudio], concurrency_limit=4)
106
  with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
107
  gr.Markdown("""# StyleTTS 2
108
 
 
6
  # from tortoise.utils.text import split_and_recombine_text
7
  import numpy as np
8
  import pickle
9
+ import spacy
10
+
11
+ from scispacy.abbreviation import AbbreviationDetector
12
+ nlp = spacy.load("en_core_sci_sm")
13
+
14
+ # Add the abbreviation pipe to the spacy pipeline.
15
+ nlp.add_pipe("abbreviation_detector")
16
+ def replace_acronyms(text):
17
+ doc = nlp(text)
18
+ altered_tok = [tok.text for tok in doc]
19
+ for abrv in doc._.abbreviations:
20
+ altered_tok[abrv.start] = str(abrv._.long_form)
21
+
22
+ return(" ".join(altered_tok))
23
  theme = gr.themes.Base(
24
  font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
25
  )
 
34
  # else:
35
  for v in voicelist:
36
  voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
37
+ def synthesize(text, voice, multispeakersteps, msexpand):
38
  if text.strip() == "":
39
  raise gr.Error("You must enter some text")
40
  # if len(global_phonemizer.phonemize([text])) > 300:
41
  if len(text) > 300:
42
  raise gr.Error("Text must be under 300 characters")
43
+ if msexpand:
44
+ text = replace_acronyms(text)
45
  v = voice.lower()
46
  # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
47
  return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
 
69
  raise gr.Error("Text must be under 300 characters")
70
  # return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=20, embedding_scale=1))
71
  return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
72
+ def ljsynthesize(text, steps):
73
  if text.strip() == "":
74
  raise gr.Error("You must enter some text")
75
  # if global_phonemizer.phonemize([text]) > 300:
76
  if len(text) > 300:
77
  raise gr.Error("Text must be under 300 characters")
78
  noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
79
+ return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=steps, embedding_scale=1))
80
 
81
 
82
  with gr.Blocks() as vctk: # just realized it isn't vctk but libritts but i'm too lazy to change it rn
 
85
  inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
86
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
87
  multispeakersteps = gr.Slider(minimum=5, maximum=15, value=7, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
88
+ msexpand = gr.Checkbox(label="Expand acronyms", info="Expand acronyms using SciSpacy algorithm")
89
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
90
  with gr.Column(scale=1):
91
  btn = gr.Button("Synthesize", variant="primary")
92
  audio = gr.Audio(interactive=False, label="Synthesized Audio")
93
+ btn.click(synthesize, inputs=[inp, voice, multispeakersteps, msexpand], outputs=[audio], concurrency_limit=4)
94
  with gr.Blocks() as clone:
95
  with gr.Row():
96
  with gr.Column(scale=1):
 
100
  with gr.Column(scale=1):
101
  clbtn = gr.Button("Synthesize", variant="primary")
102
  claudio = gr.Audio(interactive=False, label="Synthesized Audio")
103
+ clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps], outputs=[claudio], concurrency_limit=2)
104
+ with gr.Blocks() as lj:
105
+ with gr.Row():
106
+ with gr.Column(scale=1):
107
+ ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
108
+ with gr.Column(scale=1):
109
+ ljbtn = gr.Button("Synthesize", variant="primary")
110
+ ljaudio = gr.Audio(interactive=False, label="Synthesized Audio")
111
+ ljsteps = gr.Slider(minimum=5, maximum=15, value=7, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
112
+ ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)
113
  # with gr.Blocks() as longText:
114
  # with gr.Row():
115
  # with gr.Column(scale=1):
 
121
  # lngbtn = gr.Button("Synthesize", variant="primary")
122
  # lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
123
  # lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
 
 
 
 
 
 
 
 
124
  with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
125
  gr.Markdown("""# StyleTTS 2
126
 
requirements.txt CHANGED
@@ -20,4 +20,6 @@ phonemizer
20
  cached-path
21
  gradio
22
  gruut
23
- # tortoise-tts
 
 
 
20
  cached-path
21
  gradio
22
  gruut
23
+ # tortoise-tts
24
+ spacy
25
+ scispacy