Spaces:
Running
on
Zero
Running
on
Zero
Upload 2 files
Browse files- app.py +26 -21
- requirements.txt +1 -0
app.py
CHANGED
@@ -3,6 +3,7 @@ from huggingface_hub import snapshot_download
|
|
3 |
from katsu import Katsu
|
4 |
from models import build_model
|
5 |
import gradio as gr
|
|
|
6 |
import numpy as np
|
7 |
import os
|
8 |
import phonemizer
|
@@ -269,8 +270,17 @@ def clamp_speed(speed):
|
|
269 |
return 2
|
270 |
return speed
|
271 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
# Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
|
273 |
-
def generate(text, voice='af', ps=None, speed=1,
|
274 |
ps = ps or phonemize(text, voice)
|
275 |
if not sk and (text in sents or ps.strip('"') in harvsents):
|
276 |
sk = os.environ['SK']
|
@@ -278,7 +288,7 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto', sk=N
|
|
278 |
return (None, '')
|
279 |
voices = resolve_voices(voice, warn=ps)
|
280 |
speed = clamp_speed(speed)
|
281 |
-
|
282 |
use_gpu = use_gpu if use_gpu in ('auto', False, True) else 'auto'
|
283 |
tokens = tokenize(ps)
|
284 |
if not tokens:
|
@@ -302,11 +312,8 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto', sk=N
|
|
302 |
raise gr.Error(e)
|
303 |
print(debug, datetime.now(), voices, len(ps), use_gpu, repr(e))
|
304 |
return (None, '')
|
305 |
-
|
306 |
-
|
307 |
-
if trim * 2 >= len(out):
|
308 |
-
return (None, '')
|
309 |
-
out = out[trim:-trim]
|
310 |
print(debug, datetime.now(), voices, len(ps), use_gpu, len(out))
|
311 |
return ((SAMPLE_RATE, out), ps)
|
312 |
|
@@ -352,7 +359,7 @@ with gr.Blocks() as basic_tts:
|
|
352 |
autoplay = gr.Checkbox(value=True, label='Autoplay')
|
353 |
autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
|
354 |
speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
|
355 |
-
|
356 |
with gr.Accordion('Output Tokens', open=True):
|
357 |
out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
|
358 |
with gr.Accordion('Voice Mixer', open=False):
|
@@ -367,8 +374,8 @@ with gr.Blocks() as basic_tts:
|
|
367 |
with gr.Row():
|
368 |
sk = gr.Textbox(visible=False)
|
369 |
text.change(lambda: os.environ['SK'], outputs=[sk])
|
370 |
-
text.submit(generate, inputs=[text, voice, in_ps, speed,
|
371 |
-
generate_btn.click(generate, inputs=[text, voice, in_ps, speed,
|
372 |
|
373 |
@torch.no_grad()
|
374 |
def lf_forward(token_lists, voices, speed, sk, device='cpu'):
|
@@ -457,14 +464,14 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
|
|
457 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
458 |
return [(i, *row) for i, row in enumerate(segments)]
|
459 |
|
460 |
-
def lf_generate(segments, voice, speed=1,
|
461 |
if sk != os.environ['SK']:
|
462 |
return
|
463 |
token_lists = list(map(tokenize, segments['Tokens']))
|
464 |
voices = resolve_voices(voice)
|
465 |
speed = clamp_speed(speed)
|
466 |
-
|
467 |
-
pad_between = int(pad_between
|
468 |
use_gpu = True
|
469 |
batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
|
470 |
i = 0
|
@@ -489,10 +496,8 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, s
|
|
489 |
else:
|
490 |
raise gr.Error(e)
|
491 |
for out in outs:
|
492 |
-
if
|
493 |
-
|
494 |
-
continue
|
495 |
-
out = out[trim:-trim]
|
496 |
if i > 0 and pad_between > 0:
|
497 |
yield (SAMPLE_RATE, np.zeros(pad_between))
|
498 |
yield (SAMPLE_RATE, out)
|
@@ -537,8 +542,8 @@ with gr.Blocks() as lf_tts:
|
|
537 |
audio_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
|
538 |
with gr.Accordion('Audio Settings', open=True):
|
539 |
speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
|
540 |
-
|
541 |
-
pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How
|
542 |
with gr.Row():
|
543 |
segment_btn = gr.Button('Tokenize', variant='primary')
|
544 |
generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
|
@@ -550,12 +555,12 @@ with gr.Blocks() as lf_tts:
|
|
550 |
sk = gr.Textbox(visible=False)
|
551 |
segments.change(lambda: os.environ['SK'], outputs=[sk])
|
552 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
553 |
-
generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed,
|
554 |
stop_btn.click(fn=None, cancels=generate_event)
|
555 |
|
556 |
with gr.Blocks() as about:
|
557 |
gr.Markdown('''
|
558 |
-
Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#
|
559 |
|
560 |
### FAQ
|
561 |
**Will this be open sourced?**<br/>
|
|
|
3 |
from katsu import Katsu
|
4 |
from models import build_model
|
5 |
import gradio as gr
|
6 |
+
import librosa
|
7 |
import numpy as np
|
8 |
import os
|
9 |
import phonemizer
|
|
|
270 |
return 2
|
271 |
return speed
|
272 |
|
273 |
+
def clamp_top_db(top_db):
    """Normalize a user-supplied silence-trim threshold (``top_db``, in dB).

    Args:
        top_db: Requested threshold; may be any object, since it arrives
            from UI/API input.

    Returns:
        * ``60`` when ``top_db`` is not an ``int``/``float``, or is NaN.
        * ``None`` when ``top_db`` is below 30. Callers test the result with
          ``if top_db:`` before calling ``librosa.effects.trim``, so a falsy
          result disables trimming.
        * ``90`` when ``top_db`` is above 90.
        * ``top_db`` unchanged otherwise.
    """
    if not isinstance(top_db, (int, float)):
        return 60
    # NaN is a float, compares False against every bound, and is truthy, so
    # without this guard it would fall through and reach the trim call.
    # `x != x` is True only for NaN.
    if top_db != top_db:
        return 60
    if top_db < 30:
        return None
    if top_db > 90:
        return 90
    return top_db
|
281 |
+
|
282 |
# Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
|
283 |
+
def generate(text, voice='af', ps=None, speed=1, top_db=60, use_gpu='auto', sk=None):
|
284 |
ps = ps or phonemize(text, voice)
|
285 |
if not sk and (text in sents or ps.strip('"') in harvsents):
|
286 |
sk = os.environ['SK']
|
|
|
288 |
return (None, '')
|
289 |
voices = resolve_voices(voice, warn=ps)
|
290 |
speed = clamp_speed(speed)
|
291 |
+
top_db = clamp_top_db(top_db)
|
292 |
use_gpu = use_gpu if use_gpu in ('auto', False, True) else 'auto'
|
293 |
tokens = tokenize(ps)
|
294 |
if not tokens:
|
|
|
312 |
raise gr.Error(e)
|
313 |
print(debug, datetime.now(), voices, len(ps), use_gpu, repr(e))
|
314 |
return (None, '')
|
315 |
+
if top_db:
|
316 |
+
out, _ = librosa.effects.trim(out, top_db=top_db)
|
|
|
|
|
|
|
317 |
print(debug, datetime.now(), voices, len(ps), use_gpu, len(out))
|
318 |
return ((SAMPLE_RATE, out), ps)
|
319 |
|
|
|
359 |
autoplay = gr.Checkbox(value=True, label='Autoplay')
|
360 |
autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
|
361 |
speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
|
362 |
+
top_db = gr.Slider(minimum=0, maximum=90, value=60, step=30, label='✂️ Trim top_db (librosa.effects.trim)', info='Threshold (in db) below peak to trim')
|
363 |
with gr.Accordion('Output Tokens', open=True):
|
364 |
out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
|
365 |
with gr.Accordion('Voice Mixer', open=False):
|
|
|
374 |
with gr.Row():
|
375 |
sk = gr.Textbox(visible=False)
|
376 |
text.change(lambda: os.environ['SK'], outputs=[sk])
|
377 |
+
text.submit(generate, inputs=[text, voice, in_ps, speed, top_db, use_gpu, sk], outputs=[audio, out_ps])
|
378 |
+
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, top_db, use_gpu, sk], outputs=[audio, out_ps])
|
379 |
|
380 |
@torch.no_grad()
|
381 |
def lf_forward(token_lists, voices, speed, sk, device='cpu'):
|
|
|
464 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
465 |
return [(i, *row) for i, row in enumerate(segments)]
|
466 |
|
467 |
+
def lf_generate(segments, voice, speed=1, top_db=0, pad_between=0, use_gpu=True, sk=None):
|
468 |
if sk != os.environ['SK']:
|
469 |
return
|
470 |
token_lists = list(map(tokenize, segments['Tokens']))
|
471 |
voices = resolve_voices(voice)
|
472 |
speed = clamp_speed(speed)
|
473 |
+
top_db = clamp_top_db(top_db)
|
474 |
+
pad_between = int(pad_between)
|
475 |
use_gpu = True
|
476 |
batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
|
477 |
i = 0
|
|
|
496 |
else:
|
497 |
raise gr.Error(e)
|
498 |
for out in outs:
|
499 |
+
if top_db:
|
500 |
+
out, _ = librosa.effects.trim(out, top_db=top_db)
|
|
|
|
|
501 |
if i > 0 and pad_between > 0:
|
502 |
yield (SAMPLE_RATE, np.zeros(pad_between))
|
503 |
yield (SAMPLE_RATE, out)
|
|
|
542 |
audio_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
|
543 |
with gr.Accordion('Audio Settings', open=True):
|
544 |
speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
|
545 |
+
top_db = gr.Slider(minimum=0, maximum=90, value=0, step=30, label='✂️ Trim top_db (librosa.effects.trim)', info='Threshold (in db) below peak to trim')
|
546 |
+
pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How many silent samples to insert between segments')
|
547 |
with gr.Row():
|
548 |
segment_btn = gr.Button('Tokenize', variant='primary')
|
549 |
generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
|
|
|
555 |
sk = gr.Textbox(visible=False)
|
556 |
segments.change(lambda: os.environ['SK'], outputs=[sk])
|
557 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
558 |
+
generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, top_db, pad_between, use_gpu, sk], outputs=[audio_stream])
|
559 |
stop_btn.click(fn=None, cancels=generate_event)
|
560 |
|
561 |
with gr.Blocks() as about:
|
562 |
gr.Markdown('''
|
563 |
+
Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L34) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
|
564 |
|
565 |
### FAQ
|
566 |
**Will this be open sourced?**<br/>
|
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
fugashi
|
2 |
gradio
|
|
|
3 |
mojimoji
|
4 |
munch
|
5 |
phonemizer
|
|
|
1 |
fugashi
|
2 |
gradio
|
3 |
+
librosa
|
4 |
mojimoji
|
5 |
munch
|
6 |
phonemizer
|