# --- Web page residue captured together with the file (not part of app.py) ---
# Respair's picture
# Update app.py
# fb43b4e verified
# raw
# history blame
# 12.6 kB
INTROTXT = """# StyleTTS 2
kudos to mrfakename for the base gradio code I'm borrowing here.
ๆ—ฅๆœฌ่ชž็”จ
You will probably experience slight artifacts at the beginning or at the end of the output, which is not there on my server.
Unfortunately, due to the variation in how floating-point operations are performed across different devices,
and given the intrinsic characteristics of models that incorporate diffusion components,
it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.
So, the output you're about to hear may not accurately reflect the true performance of the model.
=========
ๆฎ‹ๅฟตใชใŒใ‚‰ใ€็•ฐใชใ‚‹ใƒ‡ใƒใ‚คใ‚นใงๆตฎๅ‹•ๅฐๆ•ฐ็‚นๆผ”็ฎ—ใŒ็•ฐใชใ‚‹ๆ–นๆณ•ใง่กŒใ‚ใ‚Œใ‚‹ใŸใ‚ใ€ใŠใ‚ˆใณDiffusionใ‚ณใƒณใƒใƒผใƒใƒณใƒˆใ‚’ๅ–ใ‚Šๅ…ฅใ‚ŒใŸใƒขใƒ‡ใƒซใฎๅ›บๆœ‰ใฎ็‰นๆ€งใ‚’่€ƒๆ…ฎใ™ใ‚‹ใจใ€
ใƒขใƒ‡ใƒซใŒๅ…ƒใ€…ใƒˆใƒฌใƒผใƒ‹ใƒณใ‚ฐใ•ใ‚ŒใŸใƒ‡ใƒใ‚คใ‚นใงๅพ—ใ‚‰ใ‚ŒใŸ็ตๆžœใจๅŒใ˜็ตๆžœใ‚’ๅพ—ใ‚‹ใ“ใจใฏ้›ฃใ—ใ„ใงใ—ใ‚‡ใ†ใ€‚
ใใฎ็ตๆžœใ€ไปฅไธ‹ใงไฝ“้จ“ใ™ใ‚‹ใƒ‘ใƒ•ใ‚ฉใƒผใƒžใƒณใ‚นใฏใƒขใƒ‡ใƒซใฎ็œŸใฎๆ€ง่ƒฝใ‚’ๆญฃ็ขบใซๅๆ˜ ใ—ใฆใ„ใพใ›ใ‚“ใ€‚
ใใฎใŸใ‚ใ€้Ÿณๅฃฐใฎ้–‹ๅง‹ๆ™‚ใพใŸใฏ็ต‚ไบ†ๆ™‚ใซใ‚ขใƒผใƒ†ใ‚ฃใƒ•ใ‚กใ‚ฏใƒˆใŒ็™บ็”Ÿใ™ใ‚‹ๅฏ่ƒฝๆ€งใŒใ‚ใ‚Šใพใ™ใ€‚
**
"""
import gradio as gr
import styletts2importable
import ljspeechimportable
import torch
import os
from txtsplit import txtsplit
import numpy as np
import pickle
# Base Gradio theme with a custom font stack (two Google fonts, then system
# fallbacks).
theme = gr.themes.Base(
    font=[
        gr.themes.GoogleFont('Libre Franklin'),
        gr.themes.GoogleFont('Public Sans'),
        'system-ui',
        'sans-serif',
    ],
)

# Names of the bundled reference voices; each one is read from
# voices/<name>.wav below.
voicelist = ['VO_JA_Kamisato_Ayaka_About_Kujou_Sara','hontonokimochi','gaen_original']

# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
# TODO: cache the computed styles on disk (e.g. a voices.pkl loaded with
# pickle) instead of recomputing them on every start-up.

# Style computed once at import time for every bundled voice, keyed by the
# voice name exactly as it appears in `voicelist`.
voices = {
    name: styletts2importable.compute_style(f'voices/{name}.wav')
    for name in voicelist
}
# def synthesize(text, voice, multispeakersteps):
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# # if len(global_phonemizer.phonemize([text])) > 300:
# if len(text) > 300:
# raise gr.Error("Text must be under 300 characters")
# v = voice.lower()
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
# Append a notice to the intro text when no CUDA device is available.
if not torch.cuda.is_available():
    INTROTXT = INTROTXT + "\n\n### on CPU, it'll run rather slower, but not too much."
def synthesize(text, voice, lngsteps, alpha=0.3, beta=0.7, progress=gr.Progress()):
    """Synthesize ``text`` with one of the precomputed reference voices.

    The text is split into chunks with ``txtsplit``; every chunk is passed to
    ``styletts2importable.inference`` and the resulting arrays are
    concatenated into a single clip.

    The fourth positional parameter used to be an unused ``password``. The
    click handler for this function supplies five inputs (text, voice, steps,
    alpha, beta), so positions four and five now carry ``alpha`` and ``beta``
    and are forwarded to inference instead of the former hard-coded 0.3 / 0.7
    (which remain the defaults).

    Args:
        text: Text to read aloud; must be non-blank and at most 50,000
            characters.
        voice: Key into the module-level ``voices`` mapping.
        lngsteps: Forwarded to inference as ``diffusion_steps``.
        alpha: Forwarded to inference as ``alpha``.
        beta: Forwarded to inference as ``beta``.
        progress: Gradio progress tracker wrapped around the chunk loop.

    Returns:
        A ``(24000, samples)`` tuple in the (rate, data) form consumed by the
        audio output component.

    Raises:
        gr.Error: If the text is blank or too long, if no voice / an unknown
            voice was given, or if there is nothing to synthesize.
    """
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    if not voice:
        raise gr.Error("You must select a voice")

    # Keys in `voices` keep the casing used in `voicelist`, so look the name
    # up as given first. The lower-cased form is only a fallback: always
    # lower-casing made mixed-case voice names impossible to find.
    style = voices.get(voice)
    if style is None:
        style = voices.get(voice.lower())
    if style is None:
        raise gr.Error(f"Unknown voice: {voice}")

    print("*** saying ***")
    print(text)
    print("*** end ***")

    texts = txtsplit(text)
    audios = []
    for chunk in progress.tqdm(texts):
        print(chunk)
        audios.append(
            styletts2importable.inference(
                chunk,
                style,
                alpha=alpha,
                beta=beta,
                diffusion_steps=lngsteps,
                embedding_scale=1,
            )
        )
    if not audios:
        # np.concatenate refuses an empty list; report it as a user-facing error.
        raise gr.Error("Nothing to synthesize")
    return (24000, np.concatenate(audios))
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
# if password == os.environ['ACCESS_CODE']:
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# if lngsteps > 25:
# raise gr.Error("Max 25 steps")
# if lngsteps < 5:
# raise gr.Error("Min 5 steps")
# texts = split_and_recombine_text(text)
# v = voice.lower()
# audios = []
# for t in progress.tqdm(texts):
# audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
# return (24000, np.concatenate(audios))
# else:
# raise gr.Error('Wrong access code')
def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
    """Synthesize ``text`` in the style of a user-supplied reference clip.

    A style is computed from ``voice`` with
    ``styletts2importable.compute_style``. The text is split with
    ``txtsplit``, each chunk is run through ``styletts2importable.inference``
    with that style, and the results are concatenated.

    Args:
        text: Text to read aloud; must be non-blank and at most 50,000
            characters.
        voice: Reference audio handed to ``compute_style`` (the UI supplies a
            file path, or nothing when no clip was provided).
        vcsteps: Forwarded to inference as ``diffusion_steps``.
        embscale: Forwarded to inference as ``embedding_scale``.
        alpha: Forwarded to inference as ``alpha``.
        beta: Forwarded to inference as ``beta``.
        progress: Gradio progress tracker wrapped around the chunk loop.

    Returns:
        A ``(24000, samples)`` tuple in the (rate, data) form consumed by the
        audio output component.

    Raises:
        gr.Error: If the text is blank or too long, or if no reference clip
            was supplied.
    """
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    if not voice:
        # Without this guard an empty audio input reaches compute_style and
        # fails with an error that means nothing to the user.
        raise gr.Error("You must provide a reference audio clip")
    if embscale > 1.3 and len(text) < 20:
        # Non-fatal: a toast is shown and synthesis still goes ahead.
        gr.Warning("WARNING: You entered short text, you may get static!")

    print("*** saying ***")
    print(text)
    print("*** end ***")

    texts = txtsplit(text)
    audios = []
    # The style is computed once and reused for every chunk.
    vs = styletts2importable.compute_style(voice)
    for t in progress.tqdm(texts):
        audios.append(
            styletts2importable.inference(
                t,
                vs,
                alpha=alpha,
                beta=beta,
                diffusion_steps=vcsteps,
                embedding_scale=embscale,
            )
        )
    return (24000, np.concatenate(audios))
def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
    """Synthesize ``text`` with the ``ljspeechimportable`` model.

    A single random noise tensor of shape (1, 1, 256) is drawn up front and
    reused for every chunk produced by ``txtsplit``.

    Args:
        text: Text to read aloud; must be non-blank and at most 150,000
            characters.
        steps: Forwarded to inference as ``diffusion_steps``.
        embscale: Forwarded to inference as ``embedding_scale``.
        progress: Gradio progress tracker wrapped around the chunk loop.

    Returns:
        A ``(24000, samples)`` tuple with the concatenated chunk outputs.

    Raises:
        gr.Error: If the text is blank or longer than 150,000 characters.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    noise = torch.randn(1, 1, 256).to(device)

    if not text.strip():
        raise gr.Error("You must enter some text")
    if len(text) > 150000:
        raise gr.Error("Text must be <150k characters")

    for line in ("*** saying ***", text, "*** end ***"):
        print(line)

    audios = [
        ljspeechimportable.inference(chunk, noise, diffusion_steps=steps, embedding_scale=embscale)
        for chunk in progress.tqdm(txtsplit(text))
    ]
    return (24000, np.concatenate(audios))
# Preset-voice tab: text box, voice picker and sliders on the left, the
# synthesize button and audio player on the right.
with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
            # The default has to be one of the offered choices; the previous
            # default ('m-us-2') is not in `voicelist`.
            voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value=voicelist[0], interactive=True)
            multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="これを増えたらもっとエモーショナルな結果になりますが、クオリティーのいい結果になるとは限らない。", interactive=True)
            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
            beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
            # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
        with gr.Column(scale=1):
            btn = gr.Button("Synthesize", variant="primary")
            audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
            # NOTE(review): five inputs are passed here, so `synthesize` has to
            # accept text, voice, steps, alpha and beta positionally.
            btn.click(synthesize, inputs=[inp, voice, multispeakersteps,alpha,beta], outputs=[audio], concurrency_limit=4)
# Reference-clip tab: the user supplies text and an audio clip, tunes the
# sliders, and the click handler hands everything to `clsynthesize`.
with gr.Blocks() as clone:
    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            clinp = gr.Textbox(
                label="Text",
                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
                interactive=True,
            )
            # type='filepath' means the handler receives a path to the clip.
            clvoice = gr.Audio(
                label="Voice",
                interactive=True,
                type='filepath',
                max_length=300,
                waveform_options={'waveform_progress_color': '#3C82F6'},
            )
            vcsteps = gr.Slider(
                minimum=3,
                maximum=20,
                value=20,
                step=1,
                label="Diffusion Steps",
                info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
                interactive=True,
            )
            embscale = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=0.1,
                label="Embedding Scale (READ WARNING BELOW)",
                info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!",
                interactive=True,
            )
            alpha = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.3,
                step=0.1,
                label="Alpha",
                info="Defaults to 0.3",
                interactive=True,
            )
            beta = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.7,
                step=0.1,
                label="Beta",
                info="Defaults to 0.7",
                interactive=True,
            )
        # Right column: trigger and result.
        with gr.Column(scale=1):
            clbtn = gr.Button("Synthesize", variant="primary")
            claudio = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={'waveform_progress_color': '#3C82F6'},
            )
            clbtn.click(
                clsynthesize,
                inputs=[clinp, clvoice, vcsteps, embscale, alpha, beta],
                outputs=[claudio],
                concurrency_limit=4,
            )
# with gr.Blocks() as longText:
# with gr.Row():
# with gr.Column(scale=1):
# lnginp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
# lngvoice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
# lngsteps = gr.Slider(minimum=5, maximum=25, value=10, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
# lngpwd = gr.Textbox(label="Access code", info="This feature is in beta. You need an access code to use it as it uses more resources and we would like to prevent abuse")
# with gr.Column(scale=1):
# lngbtn = gr.Button("Synthesize", variant="primary")
# lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
# lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
# Text-only tab: no reference voice, just text plus two sliders handed to
# `ljsynthesize`. The Japanese default text and slider hint had been
# mis-decoded (UTF-8 bytes read through a legacy code page) and are restored.
with gr.Blocks() as lj:
    with gr.Row():
        with gr.Column(scale=1):
            ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True, value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしています。あなたがいない日は、まるで冬のように寒く、暗いです.")
            embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. これを上げたらパフォーマンスがもっとエモーショナルになる、増やしすぎるとだめになるので、ご注意ください", interactive=True)
            ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
        with gr.Column(scale=1):
            ljbtn = gr.Button("Synthesize", variant="primary")
            ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
            # Input order matches ljsynthesize(text, steps, embscale).
            ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps, embscale], outputs=[ljaudio], concurrency_limit=4)
# Top-level page: intro text, a duplicate-space button, the three tabs and the
# attribution footer.
with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme="NoCrypt/miku") as demo:
    gr.Markdown(INTROTXT)
    gr.DuplicateButton("Duplicate Space")
    # gr.TabbedInterface([vctk, clone, lj, longText], ['Multi-Voice', 'Voice Cloning', 'Text-guided Inference', 'Long Text [Beta]'])
    # One tab name per interface: the stale 'Long Text [Beta]' name had no
    # matching interface once the long-text tab was disabled.
    gr.TabbedInterface([vctk, clone, lj], ['With Reference Audio', '|do not use this option|', 'Text-guided Inference'])
    gr.Markdown("""
the base code was borrowed from -> [mrfakename](https://twitter.com/realmrfakename). Neither of us are affiliated with the StyleTTS 2 authors.
""") # Please do not remove this line.
if __name__ == "__main__":
    # Enable the request queue (max_size=15, API closed), then start the
    # server with the API page hidden.
    queued_demo = demo.queue(api_open=False, max_size=15)
    queued_demo.launch(show_api=False)