Precompute examples, minor layout adjustments
Browse files
app.py
CHANGED
@@ -8,6 +8,8 @@ import torchaudio
|
|
8 |
from pathlib import Path
|
9 |
from whisperspeech.pipeline import Pipeline
|
10 |
|
|
|
|
|
11 |
title = """# 🙋🏻♂️ Welcome to Collabora's WhisperSpeech
|
12 |
|
13 |
WhisperSpeech is an Open Source text-to-speech system built by Collabora and LAION by inverting Whisper.
|
@@ -77,7 +79,7 @@ def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
|
|
77 |
audio = pipe.vocoder.decode(atoks)
|
78 |
return audio.cpu()
|
79 |
|
80 |
-
def whisper_speech_demo(multilingual_text, speaker_audio, speaker_url, cps):
|
81 |
if len(multilingual_text) == 0:
|
82 |
raise gr.Error("Please enter some text for me to speak!")
|
83 |
|
@@ -92,6 +94,9 @@ def whisper_speech_demo(multilingual_text, speaker_audio, speaker_url, cps):
|
|
92 |
# torchaudio.save(mp3, audio, 24000, format='mp3')
|
93 |
# return mp3.getvalue()
|
94 |
|
|
|
|
|
|
|
95 |
with gr.Blocks() as demo:
|
96 |
gr.Markdown(title)
|
97 |
with gr.Row(equal_height=True):
|
@@ -101,29 +106,27 @@ with gr.Blocks() as demo:
|
|
101 |
info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
|
102 |
cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
|
103 |
label="Tempo (in characters per second)")
|
104 |
-
|
|
|
105 |
sources=["upload", "microphone"],
|
106 |
type='filepath')
|
|
|
107 |
gr.Markdown(" \n ") # fixes the bottom overflow from Audio
|
108 |
-
url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
|
109 |
generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
|
110 |
with gr.Column(scale=1):
|
111 |
output_audio = gr.Audio(label="WhisperSpeech says…")
|
112 |
|
113 |
-
with gr.
|
|
|
114 |
gr.Examples(
|
115 |
examples=text_examples,
|
116 |
inputs=[text_input, url_input],
|
117 |
outputs=[output_audio],
|
118 |
fn=whisper_speech_demo,
|
119 |
-
cache_examples=
|
120 |
-
label="Try these to get started !🌟🌬️"
|
121 |
)
|
122 |
|
123 |
generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
|
124 |
gr.Markdown(footer)
|
125 |
|
126 |
-
|
127 |
-
pipe.generate("WhisperSpeech warmup")
|
128 |
-
|
129 |
-
demo.launch()
|
|
|
8 |
from pathlib import Path
|
9 |
from whisperspeech.pipeline import Pipeline
|
10 |
|
11 |
+
DEVEL=os.environ.get('DEVEL', False)
|
12 |
+
|
13 |
title = """# 🙋🏻♂️ Welcome to Collabora's WhisperSpeech
|
14 |
|
15 |
WhisperSpeech is an Open Source text-to-speech system built by Collabora and LAION by inverting Whisper.
|
|
|
79 |
audio = pipe.vocoder.decode(atoks)
|
80 |
return audio.cpu()
|
81 |
|
82 |
+
def whisper_speech_demo(multilingual_text, speaker_audio=None, speaker_url="", cps=14):
|
83 |
if len(multilingual_text) == 0:
|
84 |
raise gr.Error("Please enter some text for me to speak!")
|
85 |
|
|
|
94 |
# torchaudio.save(mp3, audio, 24000, format='mp3')
|
95 |
# return mp3.getvalue()
|
96 |
|
97 |
+
pipe = Pipeline(torch_compile=not DEVEL)
|
98 |
+
# warmup will come from regenerating the examples
|
99 |
+
|
100 |
with gr.Blocks() as demo:
|
101 |
gr.Markdown(title)
|
102 |
with gr.Row(equal_height=True):
|
|
|
106 |
info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
|
107 |
cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
|
108 |
label="Tempo (in characters per second)")
|
109 |
+
with gr.Row(equal_height=True):
|
110 |
+
speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
|
111 |
sources=["upload", "microphone"],
|
112 |
type='filepath')
|
113 |
+
url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
|
114 |
gr.Markdown(" \n ") # fixes the bottom overflow from Audio
|
|
|
115 |
generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
|
116 |
with gr.Column(scale=1):
|
117 |
output_audio = gr.Audio(label="WhisperSpeech says…")
|
118 |
|
119 |
+
with gr.Column():
|
120 |
+
gr.Markdown("### Try these examples to get started !🌟🌬️")
|
121 |
gr.Examples(
|
122 |
examples=text_examples,
|
123 |
inputs=[text_input, url_input],
|
124 |
outputs=[output_audio],
|
125 |
fn=whisper_speech_demo,
|
126 |
+
cache_examples=not DEVEL,
|
|
|
127 |
)
|
128 |
|
129 |
generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
|
130 |
gr.Markdown(footer)
|
131 |
|
132 |
+
demo.launch(server_port=3000 if DEVEL else None)
|
|
|
|
|
|