Respair committed on
Commit
97dc1c6
•
1 Parent(s): fb780a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -8
app.py CHANGED
@@ -59,7 +59,7 @@ for v in voicelist:
59
  # # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
60
  # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
61
  if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
62
- def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
63
  if text.strip() == "":
64
  raise gr.Error("You must enter some text")
65
  if len(text) > 50000:
@@ -72,7 +72,7 @@ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
72
  audios = []
73
  for t in progress.tqdm(texts):
74
  print(t)
75
- audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
76
  return (24000, np.concatenate(audios))
77
  # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
78
  # if password == os.environ['ACCESS_CODE']:
@@ -141,20 +141,21 @@ def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
141
  with gr.Blocks() as vctk:
142
  with gr.Row():
143
  with gr.Column(scale=1):
144
- inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
145
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
146
- multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="これを増えたらもっとエモーショナルな結果になりますが、クオリティーのいい結果になるとは限らない。", interactive=True)
 
147
  alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
148
  beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
149
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
150
  with gr.Column(scale=1):
151
  btn = gr.Button("Synthesize", variant="primary")
152
  audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
153
- btn.click(synthesize, inputs=[inp, voice, multispeakersteps,alpha,beta], outputs=[audio], concurrency_limit=4)
154
  with gr.Blocks() as clone:
155
  with gr.Row():
156
  with gr.Column(scale=1):
157
- clinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
158
  clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
159
  vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
160
  embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
@@ -179,8 +180,8 @@ with gr.Blocks() as lj:
179
  with gr.Row():
180
  with gr.Column(scale=1):
181
  ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True, value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしています。あなたがいない日は、まるで冬のように寒く、暗いです.")
182
- embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. これを上げたらパフォーマンスがもっとエモーショナルになる、増やしすぎるとだめになるので、ご注意ください", interactive=True)
183
- ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
184
  with gr.Column(scale=1):
185
  ljbtn = gr.Button("Synthesize", variant="primary")
186
  ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
 
59
  # # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
60
  # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
61
  if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
62
+ def synthesize(text, voice,embscale,alpha,beta, lngsteps, progress=gr.Progress()):
63
  if text.strip() == "":
64
  raise gr.Error("You must enter some text")
65
  if len(text) > 50000:
 
72
  audios = []
73
  for t in progress.tqdm(texts):
74
  print(t)
75
+ audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=lngsteps, embedding_scale=embscale))
76
  return (24000, np.concatenate(audios))
77
  # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
78
  # if password == os.environ['ACCESS_CODE']:
 
141
  with gr.Blocks() as vctk:
142
  with gr.Row():
143
  with gr.Column(scale=1):
144
+ inp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります", interactive=True)
145
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
146
+ embscale = gr.Slider(minimum=1, maximum=10, value=1.5, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
147
+ multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1, label="Diffusion Steps", info="これを増えたらクオリティーのいい結果になるとは限らない。", interactive=True)
148
  alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
149
  beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
150
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
151
  with gr.Column(scale=1):
152
  btn = gr.Button("Synthesize", variant="primary")
153
  audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
154
+ btn.click(synthesize, inputs=[inp, voice,embscale, multispeakersteps,alpha,beta], outputs=[audio], concurrency_limit=4)
155
  with gr.Blocks() as clone:
156
  with gr.Row():
157
  with gr.Column(scale=1):
158
+ clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります", interactive=True)
159
  clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
160
  vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
161
  embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
 
180
  with gr.Row():
181
  with gr.Column(scale=1):
182
  ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True, value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしています。あなたがいない日は、まるで冬のように寒く、暗いです.")
183
+ embscale = gr.Slider(minimum=1, maximum=10, value=1.5, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
184
+ ljsteps = gr.Slider(minimum=3, maximum=20, value=5, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
185
  with gr.Column(scale=1):
186
  ljbtn = gr.Button("Synthesize", variant="primary")
187
  ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})