Spaces: Kokoro TTS, Running on Zero
Commit: Upload 2 files
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: Kokoro TTS
-emoji:
+title: Kokoro TTS v0.19
+emoji: 🔊♥️🔊
 colorFrom: indigo
 colorTo: purple
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.6.0
 app_file: app.py
 pinned: true
 license: mit
app.py CHANGED
@@ -108,16 +108,17 @@ VOCAB = get_vocab()
 def tokenize(ps):
     return [i for i in map(VOCAB.get, ps) if i is not None]

-# ⭐ Starred voices are
+# ⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.
 CHOICES = {
     '🇺🇸 🚺 American Female ⭐': 'af',
     '🇺🇸 🚺 Bella': 'af_bella',
+    '🇺🇸 🚺 Nicole': 'af_nicole',
     '🇺🇸 🚺 Sarah': 'af_sarah',
     '🇺🇸 🚺 Sky 🧪': 'af_sky',
     '🇺🇸 🚹 Adam 🧪': 'am_adam',
     '🇺🇸 🚹 Michael': 'am_michael',
-    '🇬🇧 🚹 Lewis': 'bm_lewis',
-    '🇯🇵 🚺 Japanese Female
+    '🇬🇧 🚹 Lewis 🧪': 'bm_lewis',
+    '🇯🇵 🚺 Japanese Female': 'jf_0',
 }
 VOICES = {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()}

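For reference, each dropdown label in CHOICES maps to a voicepack ID, and every listed voicepack is loaded eagerly at startup via torch.load. Below is a minimal sketch of the same lookup done lazily with a small cache; get_voice is a hypothetical helper, not part of app.py, and snapshot and device are assumed to be the objects the app already defines:

    import os
    import torch

    _VOICE_CACHE = {}

    def get_voice(voice_id, snapshot, device='cpu'):
        # Load the requested voicepack tensor on first use, then reuse it.
        if voice_id not in _VOICE_CACHE:
            path = os.path.join(snapshot, 'voicepacks', f'{voice_id}.pt')
            _VOICE_CACHE[voice_id] = torch.load(path, weights_only=True).to(device)
        return _VOICE_CACHE[voice_id]

    # e.g. get_voice(CHOICES['🇺🇸 🚺 Nicole'], snapshot, device) returns the 'af_nicole' tensor

Eager loading, as in the diff, keeps per-request latency low at the cost of a little startup time and memory; the lazy variant trades that the other way.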
@@ -159,7 +160,7 @@ def forward(tokens, voice, speed):
     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
     return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()

-def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=
+def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=0, pad_after=0):
     if voice not in VOICES:
         # Ensure stability for https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
         voice = 'af'
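The new keyword defaults complete the signature that was previously cut off; the body of generate is outside this hunk. As a rough, hedged illustration of what sample-count parameters like opening_cut, ease_in, and pad_after conventionally do to a 24 kHz waveform (an assumption based on the slider descriptions in a later hunk, not the app's actual code):

    import numpy as np

    def trim_and_pad(wav, opening_cut=4000, closing_cut=2000,
                     ease_in=3000, ease_out=1000, pad_before=0, pad_after=0):
        # Illustrative only. All arguments are sample counts at 24 kHz.
        wav = np.asarray(wav, dtype=np.float32)
        end = len(wav) - closing_cut if closing_cut else len(wav)
        wav = wav[opening_cut:end].copy()          # cut samples off both ends
        if ease_in:
            n = min(ease_in, len(wav))
            wav[:n] *= np.linspace(0.0, 1.0, n)    # fade in after the opening cut
        if ease_out:
            n = min(ease_out, len(wav))
            wav[-n:] *= np.linspace(1.0, 0.0, n)   # fade out before the closing cut
        silence = lambda k: np.zeros(k, dtype=np.float32)
        return np.concatenate([silence(pad_before), wav, silence(pad_after)])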
@@ -204,7 +205,7 @@ with gr.Blocks() as basic_tts:
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(label='Input Text')
-            voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are
+            voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.')
             with gr.Row():
                 random_btn = gr.Button('Random Text', variant='secondary')
                 generate_btn = gr.Button('Generate', variant='primary')
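Both tabs build the voice selector the same way: passing (label, value) pairs to gr.Dropdown shows the flag-and-name label in the UI while handlers receive the short voicepack ID. A standalone sketch of that pattern (a hypothetical demo, not part of app.py):

    import gradio as gr

    CHOICES = {'🇺🇸 🚺 American Female ⭐': 'af', '🇬🇧 🚹 Lewis 🧪': 'bm_lewis'}

    with gr.Blocks() as demo:
        # Each choice is (label shown to the user, value passed to handlers).
        voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice')
        picked = gr.Textbox(label='Selected voice ID')
        voice.change(lambda v: v, inputs=[voice], outputs=[picked])

    # demo.launch()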
@@ -236,9 +237,9 @@ with gr.Blocks() as basic_tts:
                 ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
         with gr.Row():
             with gr.Column():
-                pad_before = gr.Slider(minimum=0, maximum=24000, value=
+                pad_before = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
             with gr.Column():
-                pad_after = gr.Slider(minimum=0, maximum=24000, value=
+                pad_after = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
     autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
     text.submit(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
     generate_btn.click(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
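Like the opening and closing cuts, these sliders are denominated in raw samples rather than seconds; given the 0 to 24000 range, a full slider plausibly corresponds to one second at a 24 kHz output rate (inferred from the slider bounds, not stated in the diff). A small conversion helper for reasoning about the defaults:

    SAMPLE_RATE = 24_000  # assumed output rate, inferred from the 0-24000 slider range

    def ms_to_samples(ms, sr=SAMPLE_RATE):
        return int(round(ms * sr / 1000))

    def samples_to_ms(n, sr=SAMPLE_RATE):
        return 1000 * n / sr

    print(samples_to_ms(4000))  # ~166.7: the 4000-sample default opening cut is about a sixth of a second
    print(ms_to_samples(500))   # 12000: half a second of silence or padding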
@@ -388,7 +389,7 @@ with gr.Blocks() as lf_tts:
         file_input = gr.File(file_types=['.pdf', '.txt'], label='Input File: pdf or txt')
         text = gr.Textbox(label='Input Text')
         file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
-        voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are
+        voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.')
         with gr.Accordion('Text Settings', open=False):
             skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations.')
             newline_split = gr.Number(2, label='Newline Split', info='Split the input text on this many newlines. Affects how the text is segmented.', precision=0, minimum=0)
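The long-form tab's text settings describe two preprocessing steps: optionally dropping [bracketed] spans such as citation markers, and splitting the input wherever at least the chosen number of consecutive newlines occurs. A hedged sketch of that behavior; segment_text is a hypothetical helper, and the app's own implementation is not shown in this diff:

    import re

    def segment_text(text, skip_square_brackets=True, newline_split=2):
        # Optionally drop [bracketed] spans such as '[1]' or '[citation needed]'.
        if skip_square_brackets:
            text = re.sub(r'\[.*?\]', '', text)
        # Split into segments on runs of at least `newline_split` newlines;
        # 0 disables splitting and keeps the whole text as one segment.
        parts = re.split(r'\n{%d,}' % newline_split, text) if newline_split else [text]
        return [p.strip() for p in parts if p.strip()]

    # segment_text('Intro [1]\n\nSection two.') -> ['Intro', 'Section two.']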
@@ -434,6 +435,10 @@ The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](http
 Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
 The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.

+### Voice Stability
+⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.<br/>
+Unstable voices may be more likely to stumble or produce unnatural artifacts, especially on shorter texts.
+
 ### Licenses
 Inference code: MIT<br/>
 espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>