Respair committed
Commit 00bdfef
• 1 Parent(s): 97dc1c6

Update app.py

Files changed (1):
  1. app.py +19 -27
app.py CHANGED
@@ -1,28 +1,18 @@
  INTROTXT = """# StyleTTS 2
  kudos to mrfakename for the base gradio code I'm borrowing here.
-
-
  日本語用
-
  You will probably experience slight artifacts at the beginning or at the end of the output, which are not there on my server.
-
  Unfortunately, due to the variation in how floating-point operations are performed across different devices,
  and given the intrinsic characteristics of models that incorporate diffusion components,
  it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.
  So, the output you're about to hear may not accurately reflect the true performance of the model.
  It is also not limited to artifacts: even the prosody and naturalness of the speech are affected.
-
-
-
  =========
-
  音声の開始時または終了時に、もともと存在しなかったはずのアーティファクトが、ここで発生する可能性があります。
-
  残念ながら、異なるデバイスで浮動小数点演算が異なる方法で行われるため、およびDiffusionコンポーネントを取り入れたモデルの固有の特性を考慮すると、
  モデルが元々トレーニングされたデバイスで得られた結果と同じ結果を得ることは難しいでしょう。
  その結果、以下で体験するパフォーマンスはモデルの真の性能を正確に反映していません。
  そのため、アーティファクトの問題だけではなく、ナチュラルネスや音声クオリティーにも及びます。
-
  **
  """
  import gradio as gr
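The caveat in INTROTXT above mixes two distinct effects: the diffusion component samples fresh noise on every call, and floating-point kernels genuinely differ across devices and backends. The first effect can be pinned down on a single machine by seeding every RNG; the second cannot be seeded away. A minimal sketch, assuming a standard PyTorch stack (this helper is not part of the commit):

```python
# Hypothetical helper, not part of app.py: fixes every RNG the diffusion
# sampler could draw from, so repeated runs on the SAME device match.
# It cannot make outputs match across devices, because the floating-point
# kernels themselves differ between CPU/GPU backends.
import random

import numpy as np
import torch

def seed_everything(seed: int = 0) -> None:
    random.seed(seed)                     # Python's built-in RNG
    np.random.seed(seed)                  # NumPy RNG
    torch.manual_seed(seed)               # CPU (and default CUDA) RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # all CUDA devices
```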
@@ -59,7 +49,7 @@ for v in voicelist:
  # # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
  # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
  if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
- def synthesize(text, voice,embscale,alpha,beta, lngsteps, progress=gr.Progress()):
+ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
  if text.strip() == "":
  raise gr.Error("You must enter some text")
  if len(text) > 50000:
@@ -72,7 +62,7 @@ def synthesize(text, voice,embscale,alpha,beta, lngsteps, progress=gr.Progress()):
  audios = []
  for t in progress.tqdm(texts):
  print(t)
- audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=lngsteps, embedding_scale=embscale))
+ audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
  return (24000, np.concatenate(audios))
  # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
  # if password == os.environ['ACCESS_CODE']:
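A note on the pattern in this hunk: the input is split into chunks (`txtsplit`), each chunk is synthesized on its own, and the pieces are joined with `np.concatenate`; Gradio's `Audio` output accepts a `(sample_rate, numpy_array)` tuple, hence `(24000, np.concatenate(audios))`. A toy, runnable illustration with the model call stubbed out (`fake_inference` is not part of the app):

```python
import numpy as np

# Stand-in for styletts2importable.inference: one short waveform per chunk.
def fake_inference(text: str) -> np.ndarray:
    return np.zeros(2400, dtype=np.float32)  # 0.1 s of silence at 24 kHz

chunks = ["First sentence.", "Second sentence."]
audios = [fake_inference(t) for t in chunks]

# All chunks must be 1-D arrays at the same sample rate for this to be valid.
sr, wav = 24000, np.concatenate(audios)
print(sr, wav.shape)  # 24000 (4800,)
```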
@@ -108,14 +98,18 @@ def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
  print(text)
  print("*** end ***")
  texts = txtsplit(text)
+
  audios = []
  # vs = styletts2importable.compute_style(voice)
- vs = styletts2importable.compute_style(voice)
+
  # print(vs)
  for t in progress.tqdm(texts):
- audios.append(styletts2importable.inference(t, vs, alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
+ audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
  # audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
  return (24000, np.concatenate(audios))
+
+
+
  def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
  # if text.strip() == "":
  # raise gr.Error("You must enter some text")
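The hunk above changes the voice-cloning path: `clsynthesize` previously derived a style vector from the uploaded reference audio via `compute_style`, and now indexes the preset `voices[v]` table instead (note that `v` is not bound inside `clsynthesize` itself). For reference, a sketch of how the reference-style flow fits together, pieced together only from the calls visible in this file; the filename and parameter values are illustrative:

```python
import styletts2importable  # module shipped with this Space

# "reference.wav" is a placeholder filepath (gr.Audio with type='filepath'
# hands the function such a path).
vs = styletts2importable.compute_style("reference.wav")  # style embedding
wav = styletts2importable.inference(
    "Text to speak.",   # one chunk of text
    vs,                 # cloned style instead of a preset voice
    alpha=0.3,          # default from the Alpha slider
    beta=0.7,           # default from the Beta slider
    diffusion_steps=20,
    embedding_scale=1,
)                       # a 24 kHz waveform, per the return values above
```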
@@ -141,23 +135,22 @@ def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
  with gr.Blocks() as vctk:
  with gr.Row():
  with gr.Column(scale=1):
- inp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります", interactive=True)
- voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
- embscale = gr.Slider(minimum=1, maximum=10, value=1.5, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
- multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1, label="Diffusion Steps", info="これを増えたらクオリティーのいい結果になるとは限らない。", interactive=True)
- alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
- beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
- # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
+ clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります", value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしています。あなたがいない日は、まるで冬のように寒く、暗いです.", interactive=True)
+ voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", interactive=True)
+ vcsteps = gr.Slider(minimum=3, maximum=20, value=5, step=1, label="Diffusion Steps", info="You'll get more variation in the results if you increase it, doesn't necessarily improve anything.| これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
+ embscale = gr.Slider(minimum=1, maximum=10, value=1.8, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
+ alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", interactive=True)
+ beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1, label="Beta", interactive=True)
  with gr.Column(scale=1):
- btn = gr.Button("Synthesize", variant="primary")
- audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
- btn.click(synthesize, inputs=[inp, voice,embscale, multispeakersteps,alpha,beta], outputs=[audio], concurrency_limit=4)
+ clbtn = gr.Button("Synthesize", variant="primary")
+ claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
+ clbtn.click(clsynthesize, inputs=[clinp, voice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
  with gr.Blocks() as clone:
  with gr.Row():
  with gr.Column(scale=1):
  clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります", interactive=True)
  clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
- vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
+ vcsteps = gr.Slider(minimum=3, maximum=10, value=2, step=1, label="Diffusion Steps", info="これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
  embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
  alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
  beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
@@ -196,5 +189,4 @@ the base code was borrowed from -> [mrfakename](https://twitter.com/realmrfakena
  """) # Please do not remove this line.
  if __name__ == "__main__":
  # demo.queue(api_open=False, max_size=15).launch(show_api=False)
- demo.queue(api_open=False, max_size=15).launch(show_api=False)
-
+ demo.queue(api_open=False, max_size=15).launch(show_api=False,share=True)
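On the last added line: `queue()` and `launch()` carry the serving configuration. A minimal sketch of the same flags on a stub app, with their effects noted (assuming current Gradio semantics):

```python
# Minimal sketch of the serving configuration used above, on a stub app.
import gradio as gr

demo = gr.Interface(fn=lambda s: s, inputs="text", outputs="text")
demo.queue(
    api_open=False,   # hide the queue's REST endpoints from outside callers
    max_size=15,      # reject new jobs once 15 requests are waiting
).launch(
    show_api=False,   # hide the "Use via API" footer and docs page
    share=True,       # also serve through a temporary public gradio.live URL
)
```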
 