# Vokan / app.py
import gradio as gr
import spaces
from styletts2 import tts
import re
import numpy as np
import torch

import nltk
nltk.download('punkt')  # tokenizer models required by word_tokenize below
from nltk.tokenize import word_tokenize

import phonemizer  # espeak-ng backend, configured for en-us below
INTRO = """
<style>
.TitleContainer {
background-color: #ffff;
margin-bottom: 0rem;
margin-left: auto;
margin-right: auto;
width: 40%;
height: 30%;
border-radius: 10rem;
border: 0.5vw solid #ff593e;
text-align: center;
display: flex;
justify-content: center;
transition: .6s;
}
.TitleContainer:hover {
transform: scale(1.05);
}
.VokanLogo {
margin: auto;
display: block;
}
</style>
<div class="TitleContainer">
<img src="https://huggingface.co/spaces/ShoukanLabs/Vokan/resolve/main/Vokan.gif" class="VokanLogo">
</div>
<p align="center" style="font-size: 1vw; font-weight: bold; color: #ff593e;">A StyleTTS2 fine-tune, designed for expressiveness.</p>
<hr>
"""
# Force the Gradio light theme via a URL query parameter on page load
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
"""
theme = gr.themes.Soft(
primary_hue=gr.themes.Color(c100="#ffd7d1", c200="#ff593e", c300="#ff593e", c400="#ff593e", c50="#fff0f0", c500="#ff593e", c600="#ea580c", c700="#c2410c", c800="#9a3412", c900="#7c2d12", c950="#6c2e12"),
secondary_hue="orange",
radius_size=gr.themes.Size(lg="20px", md="8px", sm="6px", xl="30px", xs="4px", xxl="40px", xxs="2px"),
font=[gr.themes.GoogleFont('M PLUS Rounded 1c'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
).set(
block_background_fill='*neutral_50'
)
# Global espeak-ng phonemizer for US English; punctuation and stress marks
# are preserved so StyleTTS2 receives fully specified phoneme strings.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us',
    preserve_punctuation=True,
    with_stress=True,
    language_switch="remove-flags",
    tie=False,
)
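
# Illustrative sanity check (not executed here; the exact phoneme string
# depends on the installed espeak-ng voice data):
#   global_phonemizer.phonemize(["Hello world."])
#   -> ['həlˈoʊ wˈɜːld. ']  (approximately)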
def split_and_recombine_text(text, desired_length=200, max_length=300):
    """Split text into chunks of a desired length, trying to keep sentences intact."""
# normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
text = re.sub(r'\n\n+', '\n', text)
text = re.sub(r'\s+', ' ', text)
text = re.sub(r'[β€œβ€]', '"', text)
rv = []
in_quote = False
current = ""
split_pos = []
pos = -1
end_pos = len(text) - 1
    def seek(delta):
        # Advance or rewind the cursor |delta| characters, keeping `current`
        # and the in-quote state in sync; returns the character at `pos`.
        nonlocal pos, in_quote, current
        is_neg = delta < 0
        for _ in range(abs(delta)):
            if is_neg:
                pos -= 1
                current = current[:-1]
            else:
                pos += 1
                current += text[pos]
            if text[pos] == '"':
                in_quote = not in_quote
        return text[pos]

    def peek(delta):
        # Look ahead (or behind) without moving the cursor; out-of-range
        # peeks return the empty string.
        p = pos + delta
        return text[p] if p < end_pos and p >= 0 else ""

    def commit():
        # Flush the accumulated chunk and reset the split bookkeeping.
        nonlocal rv, current, split_pos
        rv.append(current)
        current = ""
        split_pos = []
while pos < end_pos:
c = seek(1)
# do we need to force a split?
if len(current) >= max_length:
if len(split_pos) > 0 and len(current) > (desired_length / 2):
# we have at least one sentence and we are over half the desired length, seek back to the last split
d = pos - split_pos[-1]
seek(-d)
else:
# no full sentences, seek back until we are not in the middle of a word and split there
while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
c = seek(-1)
commit()
# check for sentence boundaries
elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in '\n ')):
# seek forward if we have consecutive boundary markers but still within the max length
while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
c = seek(1)
split_pos.append(pos)
if len(current) >= desired_length:
commit()
        # treat end of quote as a boundary if it's followed by a space or newline
elif in_quote and peek(1) == '"' and peek(2) in '\n ':
seek(2)
split_pos.append(pos)
rv.append(current)
# clean up, remove lines with only whitespace or punctuation
rv = [s.strip() for s in rv]
rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]
return rv
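
# Example (illustrative, not executed at import time): both sentences fit
# well under desired_length=200, so they come back as a single chunk:
#   split_and_recombine_text("First sentence. Second sentence.")
#   -> ["First sentence. Second sentence."]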
def text_to_phonemes(text):
    """Phonemize one text chunk, then re-join its tokens with single spaces."""
    text = text.strip()
    print("Text before phonemization: ", text)
    ps = global_phonemizer.phonemize([text])
    print("Text after phonemization: ", ps)
    ps = word_tokenize(ps[0])
    ps = ' '.join(ps)
    print("Final text after tokenization: ", ps)
    return ps
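
# Illustrative call (the phoneme output is approximate and espeak-ng
# dependent; word_tokenize splits the trailing period into its own token):
#   text_to_phonemes("Hello world.") -> "həlˈoʊ wˈɜːld ."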
@spaces.GPU
def generate(audio_path, ins, speed, alpha, beta, embedding, steps=100):
    # Style vector from the reference clip; reused for every chunk so the
    # voice stays consistent across long passages.
    ref_s = other_tts.compute_style(audio_path)
    print(ref_s.size())
    s_prev = None
    texts = split_and_recombine_text(ins)
    audio = np.array([])
    for i in texts:
        i = text_to_phonemes(i)
        synthaud, s_prev = other_tts.long_inference_segment(i, diffusion_steps=steps,
                                                            alpha=alpha, beta=beta, is_phonemes=True,
                                                            embedding_scale=embedding, prev_s=s_prev, ref_s=ref_s,
                                                            speed=speed, t=0.7)
        audio = np.concatenate((audio, synthaud))
    # Peak-normalize to 16-bit PCM; guard against silent or empty output.
    peak = np.max(np.abs(audio)) if audio.size else 1.0
    scaled = np.int16(audio / max(peak, 1e-8) * 32767)
    return 24000, scaled
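
# generate() returns (sample_rate, samples), the numpy tuple format that
# gr.Audio accepts directly. A hypothetical local test, assuming a loaded
# model and a reference clip at ./ref.wav:
#   sr, wav = generate("./ref.wav", "Hello there.", 1.0, 0.3, 0.7, 2, steps=20)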
if torch.cuda.is_available():
    other_tts = tts.StyleTTS2(model_checkpoint_path='./epoch_2nd_00012.pth', config_path="models/config_ft.yml")
else:
    # No GPU available: the model is not loaded and generate() cannot run.
    other_tts = None
with gr.Blocks(theme=theme, js=js_func) as clone:
gr.HTML(INTRO)
with gr.Row():
with gr.Column(scale=1):
inp = gr.Textbox(label="Text", info="What do you want Vokan to say?", interactive=True)
voice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#FF593E'})
            steps = gr.Slider(minimum=3, maximum=100, value=20, step=1, label="Diffusion Steps", info="Higher values typically produce better results", interactive=True)
            embscale = gr.Slider(minimum=1, maximum=10, value=2, step=0.1, label="Embedding Scale", info="Defaults to 2 | Higher scales can sound more emotive but may produce unexpected results", interactive=True)
            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3 | Resemblance to the speaker's voice; lower = more similar", interactive=True)
            beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7 | Resemblance to the speaker's prosody; lower = more similar, higher = driven more by the text", interactive=True)
            speed = gr.Slider(minimum=0.5, maximum=1.5, value=1, step=0.1, label="Speed of speech", info="Defaults to 1", interactive=True)
with gr.Column(scale=1):
clbtn = gr.Button("Synthesize", variant="primary")
claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#FF593E'})
clbtn.click(generate, inputs=[voice, inp, speed, alpha, beta, embscale, steps], outputs=[claudio], concurrency_limit=4)
if __name__ == "__main__":
    clone.queue(api_open=False, max_size=15).launch(show_api=False)