Plachta committed
Commit 512efa6
1 Parent(s): 26adb3f

Added examples
app.py CHANGED
@@ -31,6 +31,7 @@ from models.vallex import VALLE
 from utils.g2p import PhonemeBpeTokenizer
 from descriptions import *
 from macros import *
+from examples import *
 
 import gradio as gr
 import whisper
@@ -503,6 +504,11 @@ def main():
             btn_mp.click(make_npz_prompt,
                          inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
                          outputs=[text_output, prompt_output])
+            gr.Examples(examples=infer_from_audio_examples,
+                        inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
+                        outputs=[text_output, audio_output],
+                        fn=infer_from_audio,
+                        cache_examples=True,)
         with gr.Tab("Make prompt"):
             gr.Markdown(make_prompt_md)
             with gr.Row():
@@ -523,6 +529,11 @@ def main():
             btn_2.click(make_npz_prompt,
                         inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
                         outputs=[text_output_2, prompt_output_2])
+            gr.Examples(examples=make_npz_prompt_examples,
+                        inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
+                        outputs=[text_output_2, prompt_output_2],
+                        fn=make_npz_prompt,
+                        cache_examples=True,)
         with gr.Tab("Infer from prompt"):
             gr.Markdown(infer_from_prompt_md)
             with gr.Row():
@@ -543,8 +554,13 @@ def main():
             btn_3.click(infer_from_prompt,
                         inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
                         outputs=[text_output_3, audio_output_3])
+            gr.Examples(examples=infer_from_prompt_examples,
+                        inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
+                        outputs=[text_output_3, audio_output_3],
+                        fn=infer_from_prompt,
+                        cache_examples=True,)
         with gr.Tab("Infer long text"):
-            gr.Markdown("This is a long text generation demo. You can use this to generate long audio. ")
+            gr.Markdown(long_text_md)
             with gr.Row():
                 with gr.Column():
                     textbox_4 = gr.TextArea(label="Text",
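
The new gr.Examples blocks follow the usual Gradio pattern: each example row supplies one value per component in inputs, and with cache_examples=True the fn is run once per row at launch so that clicking an example returns the cached outputs rather than re-running inference. Below is a minimal, self-contained sketch of that pattern, not part of this commit; the echo function and the components in it are placeholders, not the VALL-E X demo's.

import gradio as gr

def echo(text, language):
    # Placeholder for infer_from_audio: just formats the inputs.
    return f"[{language}] {text}"

with gr.Blocks() as demo:
    textbox = gr.Textbox(label="Text")
    language_dropdown = gr.Dropdown(["English", "中文", "日本語"], value="English", label="Language")
    text_output = gr.Textbox(label="Message")
    btn = gr.Button("Generate")
    btn.click(echo, inputs=[textbox, language_dropdown], outputs=[text_output])

    # One value per input component, in order; cache_examples=True runs `echo`
    # over each row at startup and stores the outputs for later clicks.
    gr.Examples(examples=[["Hello there.", "English"]],
                inputs=[textbox, language_dropdown],
                outputs=[text_output],
                fn=echo,
                cache_examples=True)

if __name__ == "__main__":
    demo.launch()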
descriptions.py CHANGED
@@ -24,4 +24,9 @@ Faster than **"Infer from audio"**.<br>
 You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
 """
 
+long_text_md = """
+Very long text is chunked into several sentences, and each sentence is synthesized separately.<br>
+Please make a prompt or use a preset prompt to infer long text.
+"""
+
 long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
examples.py ADDED
@@ -0,0 +1,24 @@
+infer_from_audio_examples = [
+    ["This is how this machine has taken my voice.", 'English', 'no-accent', "prompts/en-2.wav", None, "Wow, look at that! That's no ordinary Teddy bear!"],
+    ["我喜欢抽电子烟,尤其是锐刻五代。", '中文', 'no-accent', "prompts/zh-1.wav", None, "今天我很荣幸,"],
+    ["私の声を真似するのはそんなに面白いですか?", '日本語', 'no-accent', "prompts/ja-2.ogg", None, "初めまして、朝武よしのです。"],
+    ["你可以听得出来我有多困。", '中文', 'no-accent', "prompts/en-1.wav", None, ""],
+    ["この文は、クロスリンガル合成の例です。", '日本語', 'no-accent', "prompts/zh-2.wav", None, ""],
+    ["Actually, I can't speak English, but this machine helped me do it.", 'English', 'no-accent', "prompts/ja-1.wav", None, ""],
+]
+
+make_npz_prompt_examples = [
+    ["Gem-trader", "prompts/en-2.wav", None, "Wow, look at that! That's no ordinary Teddy bear!"],
+    ["Ding Zhen", "prompts/zh-1.wav", None, "今天我很荣幸,"],
+    ["Yoshino", "prompts/ja-2.ogg", None, "初めまして、朝武よしのです。"],
+    ["Sleepy-woman", "prompts/en-1.wav", None, ""],
+    ["Yae", "prompts/zh-2.wav", None, ""],
+    ["Cafe", "prompts/ja-1.wav", None, ""],
+]
+
+infer_from_prompt_examples = [
+    ["A prompt contains voice, prosody and emotion information of a certain speaker.", "English", "no-accent", "vctk_1", None],
+    ["This prompt is made with an audio of three seconds.", "English", "no-accent", "librispeech_1", None],
+    ["This prompt is made with Chinese speech", "English", "no-accent", "seel", None],
+]
+
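
Each row above has to line up with the inputs list of the gr.Examples call that consumes it in app.py: six values for infer_from_audio, four for make_npz_prompt, five for infer_from_prompt. A hypothetical sanity check, not part of the commit, that verifies those widths:

from examples import (infer_from_audio_examples,
                      make_npz_prompt_examples,
                      infer_from_prompt_examples)

# Expected widths mirror the `inputs` lists wired up in app.py above.
expected = {
    "infer_from_audio_examples": (infer_from_audio_examples, 6),
    "make_npz_prompt_examples": (make_npz_prompt_examples, 4),
    "infer_from_prompt_examples": (infer_from_prompt_examples, 5),
}
for name, (rows, width) in expected.items():
    for row in rows:
        assert len(row) == width, f"{name}: got {len(row)} values, expected {width}"
print("all example rows match their input component counts")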
prompts/en-1.wav ADDED
Binary file (213 kB)

prompts/en-2.wav ADDED
Binary file (552 kB)

prompts/ja-1.wav ADDED
Binary file (195 kB)

prompts/ja-2.ogg ADDED
Binary file (31.4 kB)

prompts/zh-1.wav ADDED
Binary file (176 kB)

prompts/zh-2.wav ADDED
Binary file (272 kB)