Files changed (3)
  1. README.md +0 -1
  2. app.py +37 -45
  3. requirements.txt +23 -24
README.md CHANGED
@@ -8,7 +8,6 @@ sdk_version: 4.5.0
  app_file: app.py
  pinned: true
  license: agpl-3.0
- short_description: Efficient, fast, and natural text to speech with StyleTTS 2!
  ---

  License: AGPLv3
app.py CHANGED
@@ -1,23 +1,23 @@
- INTROTXT = """# StyleTTS 2
+ # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+ #                                                     #
+ #                  StyleTTS 2 Demo                    #
+ #                                                     #
+ #                                                     #
+ # Copyright (c) 2023 mrfakename. All rights reserved. #
+ #                                                     #
+ # License : AGPL v3                                   #
+ # Version : 2.0                                       #
+ # Support : https://github.com/neuralvox/styletts2    #
+ #                                                     #
+ # # # # # # # # # # # # # # # # # # # # # # # # # # # #

- [Paper](https://arxiv.org/abs/2306.07691) - [Samples](https://styletts2.github.io/) - [Code](https://github.com/yl4579/StyleTTS2) - [Discord](https://discord.gg/ha8sxdG2K4)

- A free demo of StyleTTS 2. **I am not affiliated with the StyleTTS 2 Authors.**
-
- **Before using this demo, you agree to inform the listeners that the speech samples are synthesized by the pre-trained models, unless you have the permission to use the voice you synthesize. That is, you agree to only use voices whose speakers grant the permission to have their voice cloned, either directly or by license before making synthesized voices public, or you have to publicly announce that these voices are synthesized if you do not have the permission to use these voices.**
-
- Is there a long queue on this space? Duplicate it and add a more powerful GPU to skip the wait! **Note: Thank you to Hugging Face for their generous GPU grant program!**
-
- **NOTE: StyleTTS 2 does better on longer texts.** For example, making it say "hi" will produce a lower-quality result than making it say a longer phrase.
-
- **NOTE: StyleTTS 2 is _currently_ English-only. Join the Discord for updates on multilingual training.**
- """
  import gradio as gr
  import styletts2importable
  import ljspeechimportable
  import torch
  import os
- from txtsplit import txtsplit
+ from tortoise.utils.text import split_and_recombine_text
  import numpy as np
  import pickle
  theme = gr.themes.Base(
@@ -43,20 +43,15 @@ for v in voicelist:
  # v = voice.lower()
  # # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
  # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
- if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
  def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
      if text.strip() == "":
          raise gr.Error("You must enter some text")
      if len(text) > 50000:
          raise gr.Error("Text must be <50k characters")
-     print("*** saying ***")
-     print(text)
-     print("*** end ***")
-     texts = txtsplit(text)
+     texts = split_and_recombine_text(text)
      v = voice.lower()
      audios = []
      for t in progress.tqdm(texts):
-         print(t)
          audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
      return (24000, np.concatenate(audios))
  # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
@@ -75,7 +70,7 @@ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
  # return (24000, np.concatenate(audios))
  # else:
  # raise gr.Error('Wrong access code')
- def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
+ def clsynthesize(text, voice, vcsteps, progress=gr.Progress()):
  # if text.strip() == "":
  # raise gr.Error("You must enter some text")
  # # if global_phonemizer.phonemize([text]) > 300:
@@ -87,19 +82,10 @@ def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progre
          raise gr.Error("You must enter some text")
      if len(text) > 50000:
          raise gr.Error("Text must be <50k characters")
-     if embscale > 1.3 and len(text) < 20:
-         gr.Warning("WARNING: You entered short text, you may get static!")
-     print("*** saying ***")
-     print(text)
-     print("*** end ***")
-     texts = txtsplit(text)
+     texts = split_and_recombine_text(text)
      audios = []
- # vs = styletts2importable.compute_style(voice)
-     vs = styletts2importable.compute_style(voice)
- # print(vs)
      for t in progress.tqdm(texts):
-         audios.append(styletts2importable.inference(t, vs, alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
- # audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
+         audios.append(styletts2importable.inference(t, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
      return (24000, np.concatenate(audios))
  def ljsynthesize(text, steps, progress=gr.Progress()):
  # if text.strip() == "":
@@ -113,17 +99,14 @@ def ljsynthesize(text, steps, progress=gr.Progress()):
          raise gr.Error("You must enter some text")
      if len(text) > 150000:
          raise gr.Error("Text must be <150k characters")
-     print("*** saying ***")
-     print(text)
-     print("*** end ***")
-     texts = txtsplit(text)
+     texts = split_and_recombine_text(text)
      audios = []
      for t in progress.tqdm(texts):
          audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
      return (24000, np.concatenate(audios))


- with gr.Blocks() as vctk:
+ with gr.Blocks() as vctk: # just realized it isn't vctk but libritts but i'm too lazy to change it rn
      with gr.Row():
          with gr.Column(scale=1):
              inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
@@ -132,21 +115,18 @@ with gr.Blocks() as vctk:
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
          with gr.Column(scale=1):
              btn = gr.Button("Synthesize", variant="primary")
-             audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
+             audio = gr.Audio(interactive=False, label="Synthesized Audio")
              btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
  with gr.Blocks() as clone:
      with gr.Row():
          with gr.Column(scale=1):
              clinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
-             clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
+             clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300)
              vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
-             embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
-             alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
-             beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
          with gr.Column(scale=1):
              clbtn = gr.Button("Synthesize", variant="primary")
-             claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
-             clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
+             claudio = gr.Audio(interactive=False, label="Synthesized Audio")
+             clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps], outputs=[claudio], concurrency_limit=4)
  # with gr.Blocks() as longText:
  # with gr.Row():
  # with gr.Column(scale=1):
@@ -165,10 +145,22 @@ with gr.Blocks() as lj:
              ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
          with gr.Column(scale=1):
              ljbtn = gr.Button("Synthesize", variant="primary")
-             ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
+             ljaudio = gr.Audio(interactive=False, label="Synthesized Audio")
              ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)
  with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
-     gr.Markdown(INTROTXT)
+     gr.Markdown("""# StyleTTS 2
+
+ [Paper](https://arxiv.org/abs/2306.07691) - [Samples](https://styletts2.github.io/) - [Code](https://github.com/yl4579/StyleTTS2)
+
+ A free demo of StyleTTS 2. **I am not affiliated with the StyleTTS 2 Authors.**
+
+ #### Help this space get to the top of HF's trending list! Please give this space a Like!
+
+ **Before using this demo, you agree to inform the listeners that the speech samples are synthesized by the pre-trained models, unless you have the permission to use the voice you synthesize. That is, you agree to only use voices whose speakers grant the permission to have their voice cloned, either directly or by license before making synthesized voices public, or you have to publicly announce that these voices are synthesized if you do not have the permission to use these voices.**
+
+ Is there a long queue on this space? Duplicate it and add a more powerful GPU to skip the wait! **Note: Thank you to Hugging Face for their generous GPU grant program!**
+
+ **NOTE: StyleTTS 2 does better on longer texts.** For example, making it say "hi" will produce a lower-quality result than making it say a longer phrase.""")
      gr.DuplicateButton("Duplicate Space")
  # gr.TabbedInterface([vctk, clone, lj, longText], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
      gr.TabbedInterface([vctk, clone, lj], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
requirements.txt CHANGED
@@ -1,24 +1,23 @@
- SoundFile
- torchaudio
- munch
- torch
- pydub
- pyyaml
- librosa
- nltk
- matplotlib
- accelerate
- transformers
- einops
- einops-exts
- tqdm
- typing
- typing-extensions
- git+https://github.com/resemble-ai/monotonic_align.git
- scipy
- phonemizer
- cached-path
- gradio
- gruut
- #tortoise-tts
- txtsplit
+ SoundFile
+ torchaudio
+ munch
+ torch
+ pydub
+ pyyaml
+ librosa
+ nltk
+ matplotlib
+ accelerate
+ transformers
+ einops
+ einops-exts
+ tqdm
+ typing
+ typing-extensions
+ git+https://github.com/resemble-ai/monotonic_align.git
+ scipy
+ phonemizer
+ cached-path
+ gradio
+ gruut
+ tortoise-tts
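
For reference, the rewritten `synthesize`, `clsynthesize`, and `ljsynthesize` functions in app.py all follow the same chunk-and-concatenate pattern, which is why `txtsplit` is dropped from requirements.txt in favour of `tortoise-tts` (the package that provides `tortoise.utils.text.split_and_recombine_text`). A minimal sketch of that pattern is below; `synthesize_long` and `infer_fn` are placeholder names used only for illustration, not functions from this repository.

```python
import numpy as np
from tortoise.utils.text import split_and_recombine_text  # provided by the tortoise-tts package


def synthesize_long(text, infer_fn, sample_rate=24000):
    """Split long text into sentence-sized chunks, synthesize each, and join the audio."""
    chunks = split_and_recombine_text(text)        # list of short text segments
    audio = [infer_fn(chunk) for chunk in chunks]  # one 1-D waveform array per chunk
    return sample_rate, np.concatenate(audio)      # Gradio-style (sample_rate, samples) tuple
```

In app.py, the role of `infer_fn` is played by `styletts2importable.inference(...)` or `ljspeechimportable.inference(...)` with the chosen voice/style and diffusion steps, and the result is returned at 24000 Hz as shown in the diff.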