Spaces: Running on L4
Added examples
Browse files
- app.py +17 -1
- descriptions.py +5 -0
- examples.py +24 -0
- prompts/en-1.wav +0 -0
- prompts/en-2.wav +0 -0
- prompts/ja-1.wav +0 -0
- prompts/ja-2.ogg +0 -0
- prompts/zh-1.wav +0 -0
- prompts/zh-2.wav +0 -0
app.py
CHANGED
@@ -31,6 +31,7 @@ from models.vallex import VALLE
|
|
31 |
from utils.g2p import PhonemeBpeTokenizer
|
32 |
from descriptions import *
|
33 |
from macros import *
|
|
|
34 |
|
35 |
import gradio as gr
|
36 |
import whisper
|
@@ -503,6 +504,11 @@ def main():
|
|
503 |
btn_mp.click(make_npz_prompt,
|
504 |
inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
505 |
outputs=[text_output, prompt_output])
|
|
|
|
|
|
|
|
|
|
|
506 |
with gr.Tab("Make prompt"):
|
507 |
gr.Markdown(make_prompt_md)
|
508 |
with gr.Row():
|
@@ -523,6 +529,11 @@ def main():
|
|
523 |
btn_2.click(make_npz_prompt,
|
524 |
inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
|
525 |
outputs=[text_output_2, prompt_output_2])
|
|
|
|
|
|
|
|
|
|
|
526 |
with gr.Tab("Infer from prompt"):
|
527 |
gr.Markdown(infer_from_prompt_md)
|
528 |
with gr.Row():
|
@@ -543,8 +554,13 @@ def main():
|
|
543 |
btn_3.click(infer_from_prompt,
|
544 |
inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
|
545 |
outputs=[text_output_3, audio_output_3])
|
|
|
|
|
|
|
|
|
|
|
546 |
with gr.Tab("Infer long text"):
|
547 |
-
gr.Markdown(
|
548 |
with gr.Row():
|
549 |
with gr.Column():
|
550 |
textbox_4 = gr.TextArea(label="Text",
|
|
|
31 |
from utils.g2p import PhonemeBpeTokenizer
|
32 |
from descriptions import *
|
33 |
from macros import *
|
34 |
+
from examples import *
|
35 |
|
36 |
import gradio as gr
|
37 |
import whisper
|
|
|
504 |
btn_mp.click(make_npz_prompt,
|
505 |
inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
506 |
outputs=[text_output, prompt_output])
|
507 |
+
gr.Examples(examples=infer_from_audio_examples,
|
508 |
+
inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
509 |
+
outputs=[text_output, audio_output],
|
510 |
+
fn=infer_from_audio,
|
511 |
+
cache_examples=True,)
|
512 |
with gr.Tab("Make prompt"):
|
513 |
gr.Markdown(make_prompt_md)
|
514 |
with gr.Row():
|
|
|
529 |
btn_2.click(make_npz_prompt,
|
530 |
inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
|
531 |
outputs=[text_output_2, prompt_output_2])
|
532 |
+
gr.Examples(examples=make_npz_prompt_examples,
|
533 |
+
inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
|
534 |
+
outputs=[text_output_2, prompt_output_2],
|
535 |
+
fn=make_npz_prompt,
|
536 |
+
cache_examples=True,)
|
537 |
with gr.Tab("Infer from prompt"):
|
538 |
gr.Markdown(infer_from_prompt_md)
|
539 |
with gr.Row():
|
|
|
554 |
btn_3.click(infer_from_prompt,
|
555 |
inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
|
556 |
outputs=[text_output_3, audio_output_3])
|
557 |
+
gr.Examples(examples=infer_from_prompt_examples,
|
558 |
+
inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
|
559 |
+
outputs=[text_output_3, audio_output_3],
|
560 |
+
fn=infer_from_prompt,
|
561 |
+
cache_examples=True,)
|
562 |
with gr.Tab("Infer long text"):
|
563 |
+
gr.Markdown(long_text_md)
|
564 |
with gr.Row():
|
565 |
with gr.Column():
|
566 |
textbox_4 = gr.TextArea(label="Text",
|
descriptions.py
CHANGED
@@ -24,4 +24,9 @@ Faster than **"Infer from audio"**.<br>
|
|
24 |
You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
|
25 |
"""
|
26 |
|
|
|
|
|
|
|
|
|
|
|
27 |
long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
|
|
|
24 |
You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
|
25 |
"""
|
26 |
|
27 |
+
long_text_md = """
|
28 |
+
Very long text is chunked into several sentences, and each sentence is synthesized separately.<br>
|
29 |
+
Please make a prompt or use a preset prompt to infer long text.
|
30 |
+
"""
|
31 |
+
|
32 |
long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
|
examples.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
infer_from_audio_examples = [
|
2 |
+
["This is how this machine has taken my voice.", 'English', 'no-accent', "prompts/en-2.wav", None, "Wow, look at that! That's no ordinary Teddy bear!"],
|
3 |
+
["我喜欢抽电子烟,尤其是锐刻五代。", '中文', 'no-accent', "prompts/zh-1.wav", None, "今天我很荣幸,"],
|
4 |
+
["私の声を真似するのはそんなに面白いですか?", '日本語', 'no-accent', "prompts/ja-2.ogg", None, "初めまして、朝武よしのです。"],
|
5 |
+
["你可以听得出来我有多困。", '中文', 'no-accent', "prompts/en-1.wav", None, ""],
|
6 |
+
["この文は、クロスリンガル合成の例です。", '日本語', 'no-accent', "prompts/zh-2.wav", None, ""],
|
7 |
+
["Actually, I can't speak English, but this machine helped me do it.", 'English', 'no-accent', "prompts/ja-1.wav", None, ""],
|
8 |
+
]
|
9 |
+
|
10 |
+
make_npz_prompt_examples = [
|
11 |
+
["Gem-trader", "prompts/en-2.wav", None, "Wow, look at that! That's no ordinary Teddy bear!"],
|
12 |
+
["Ding Zhen", "prompts/zh-1.wav", None, "今天我很荣幸,"],
|
13 |
+
["Yoshino", "prompts/ja-2.ogg", None, "初めまして、朝武よしのです。"],
|
14 |
+
["Sleepy-woman", "prompts/en-1.wav", None, ""],
|
15 |
+
["Yae", "prompts/zh-2.wav", None, ""],
|
16 |
+
["Cafe", "prompts/ja-1.wav", None, ""],
|
17 |
+
]
|
18 |
+
|
19 |
+
infer_from_prompt_examples = [
|
20 |
+
["A prompt contains voice, prosody and emotion information of a certain speaker.", "English", "no-accent", "vctk_1", None],
|
21 |
+
["This prompt is made with an audio of three seconds.", "English", "no-accent", "librispeech_1", None],
|
22 |
+
["This prompt is made with Chinese speech", "English", "no-accent", "seel", None],
|
23 |
+
]
|
24 |
+
|
prompts/en-1.wav
ADDED
Binary file (213 kB). View file
|
|
prompts/en-2.wav
ADDED
Binary file (552 kB). View file
|
|
prompts/ja-1.wav
ADDED
Binary file (195 kB). View file
|
|
prompts/ja-2.ogg
ADDED
Binary file (31.4 kB). View file
|
|
prompts/zh-1.wav
ADDED
Binary file (176 kB). View file
|
|
prompts/zh-2.wav
ADDED
Binary file (272 kB). View file
|
|