Simplified the UI and the generation code
Browse files
Added more examples and support for a URL-based voice cloning sample.
app.py
CHANGED
@@ -1,16 +1,12 @@
|
|
1 |
import spaces
|
2 |
-
import tempfile
|
3 |
-
import wave
|
4 |
import gradio as gr
|
|
|
5 |
import os
|
6 |
-
import re
|
7 |
import torch
|
8 |
-
import
|
9 |
-
|
10 |
-
import torch.nn.functional as F
|
11 |
from whisperspeech.pipeline import Pipeline
|
12 |
-
from whisperspeech.languages import LANGUAGES
|
13 |
-
from whisperspeech.utils import resampler
|
14 |
|
15 |
title = """# 🙋🏻♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech
|
16 |
|
@@ -26,9 +22,13 @@ This space runs on ZeroGPU, so **you need to be patient** while you acquire the
|
|
26 |
|
27 |
|
28 |
text_examples = [
|
29 |
-
["
|
30 |
-
["
|
31 |
-
["<
|
|
|
|
|
|
|
|
|
32 |
]
|
33 |
|
34 |
def parse_multilingual_text(input_text):
|
@@ -42,74 +42,63 @@ def parse_multilingual_text(input_text):
|
|
42 |
return segments
|
43 |
|
44 |
@spaces.GPU(enable_queue=True)
|
45 |
-
def
|
46 |
-
if
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
return
|
55 |
-
|
56 |
-
def
|
57 |
-
|
58 |
-
|
59 |
|
60 |
-
|
61 |
-
@spaces.GPU(enable_queue=True)
|
62 |
-
def whisper_speech_demo(multilingual_text, speaker_audio):
|
63 |
segments = parse_multilingual_text(multilingual_text)
|
64 |
-
if not segments:
|
65 |
-
return None, "No valid language segments found. Please use the format: <lang> text"
|
66 |
-
|
67 |
-
pipe = Pipeline()
|
68 |
-
if not hasattr(pipe, 's2a'):
|
69 |
-
return None, "Pipeline initialization failed. s2a model not loaded."
|
70 |
-
|
71 |
-
speaker_url = speaker_audio if speaker_audio is not None else None
|
72 |
-
audio_segments = []
|
73 |
-
|
74 |
-
for lang, text in segments:
|
75 |
-
text_str = text if isinstance(text, str) else str(text)
|
76 |
-
audio_np = generate_segment_audio(text_str, lang, speaker_url, pipe)
|
77 |
-
# Debug statement print("Audio segment shape:", audio_np.shape)
|
78 |
-
audio_segments.append(audio_np)
|
79 |
|
80 |
-
|
81 |
-
# Debug statement print("Final concatenated audio shape:", concatenated_audio.shape)
|
82 |
-
concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
|
83 |
|
84 |
-
return (24000,
|
85 |
|
|
|
|
|
|
|
|
|
86 |
|
87 |
with gr.Blocks() as demo:
|
88 |
gr.Markdown(title)
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
93 |
speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
|
94 |
-
sources=["upload", "microphone"]
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
with gr.Row():
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
|
|
|
|
|
1 |
import spaces
|
|
|
|
|
2 |
import gradio as gr
|
3 |
+
import io
|
4 |
import os
|
5 |
+
import re
|
6 |
import torch
|
7 |
+
import torchaudio
|
8 |
+
from pathlib import Path
|
|
|
9 |
from whisperspeech.pipeline import Pipeline
|
|
|
|
|
10 |
|
11 |
title = """# 🙋🏻♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech
|
12 |
|
|
|
22 |
|
23 |
|
24 |
text_examples = [
|
25 |
+
["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
|
26 |
+
["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
|
27 |
+
["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
|
28 |
+
["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
|
29 |
+
# ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
|
30 |
+
["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
|
31 |
+
# ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
|
32 |
]
|
33 |
|
34 |
def parse_multilingual_text(input_text):
|
|
|
42 |
return segments
|
43 |
|
44 |
@spaces.GPU(enable_queue=True)
|
45 |
+
def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
    """Synthesize one audio clip from language-tagged text segments.

    Args:
        pipe: Pipeline-like object exposing ``extract_spk_emb``,
            ``default_speaker``, ``t2s.generate``, ``s2a.generate`` and
            ``vocoder.decode``.
        segments: Non-empty sequence of ``(lang, text)`` pairs.
        speaker: Path (``str`` or ``Path``) of a reference recording. Any
            other value (e.g. ``None``) falls through to ``speaker_url``.
        speaker_url: Optional location of a reference recording; only used
            when ``speaker`` is not a path.
        cps: Forwarded to ``pipe.t2s.generate`` as ``cps`` (the UI labels it
            "Tempo (in characters per second)").

    Returns:
        Whatever ``pipe.vocoder.decode`` produces, after ``.cpu()``.

    Raises:
        ValueError: If ``segments`` is empty.
    """
    # Fail early and clearly: with no segments the unpacking below would raise
    # an opaque "not enough values to unpack" error, and only after the
    # speaker embedding had already been extracted.
    if not segments:
        raise ValueError(
            "generate_audio() needs at least one (lang, text) segment"
        )

    # An uploaded/recorded file takes precedence over a pasted URL; with
    # neither, the pipeline's default speaker is used.
    if isinstance(speaker, (str, Path)):
        speaker = pipe.extract_spk_emb(speaker)
    elif speaker_url:
        speaker = pipe.extract_spk_emb(speaker_url)
    else:
        speaker = pipe.default_speaker

    # Transpose [(lang, text), ...] into two parallel lists.
    langs, texts = (list(column) for column in zip(*segments))
    print(texts, langs)

    # text -> semantic tokens -> acoustic tokens -> waveform
    stoks = pipe.t2s.generate(texts, cps=cps, lang=langs)[0]
    atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))
    audio = pipe.vocoder.decode(atoks)
    return audio.cpu()
|
55 |
+
|
56 |
+
def whisper_speech_demo(multilingual_text, speaker_audio, speaker_url, cps):
    """Gradio callback: turn the text box contents into playable audio.

    Relies on the module-level ``pipe`` and on ``parse_multilingual_text`` /
    ``generate_audio`` defined elsewhere in this file.

    Args:
        multilingual_text: Text to speak, optionally with language tags such
            as ``<en>`` or ``<pl>``.
        speaker_audio: File path of an uploaded/recorded reference voice, or
            ``None``.
        speaker_url: URL of a reference voice, used when no file was given;
            blank or whitespace-only values are treated as "not provided".
        cps: Tempo value forwarded to ``generate_audio``.

    Returns:
        A ``(24000, samples)`` tuple, where ``samples`` is the generated audio
        transposed and converted to a NumPy array.

    Raises:
        gr.Error: If there is no text to speak.
    """
    # ``not text`` also covers None, which ``len(text) == 0`` would crash on;
    # whitespace-only input is rejected for the same user-facing reason.
    if not multilingual_text or not multilingual_text.strip():
        raise gr.Error("Please enter some text for me to speak!")

    segments = parse_multilingual_text(multilingual_text)
    if not segments:
        # Surface a readable message instead of failing inside generate_audio.
        raise gr.Error("I could not find any text to speak in the input.")

    # A URL box left empty (or holding stray whitespace) means "no URL".
    speaker_url = (speaker_url or "").strip() or None

    audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)

    # NOTE: returning MP3 bytes (torchaudio.save into an io.BytesIO with
    # format='mp3') was tried and did not work in Safari, so the raw samples
    # are handed to Gradio together with the 24000 sample rate instead.
    return (24000, audio.T.numpy())
|
70 |
|
71 |
with gr.Blocks() as demo:
|
72 |
gr.Markdown(title)
|
73 |
+
with gr.Row(equal_height=True):
|
74 |
+
with gr.Column(scale=2):
|
75 |
+
text_input = gr.Textbox(label="Enter multilingual text💬📝",
|
76 |
+
value=text_examples[0][0],
|
77 |
+
info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
|
78 |
+
cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
|
79 |
+
label="Tempo (in characters per second)")
|
80 |
speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
|
81 |
+
sources=["upload", "microphone"],
|
82 |
+
type='filepath')
|
83 |
+
gr.Markdown(" \n ") # fixes the bottom overflow from Audio
|
84 |
+
url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
|
85 |
+
generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
|
86 |
+
with gr.Column(scale=1):
|
87 |
+
output_audio = gr.Audio(label="WhisperSpeech says…")
|
88 |
+
|
89 |
with gr.Row():
|
90 |
+
gr.Examples(
|
91 |
+
examples=text_examples,
|
92 |
+
inputs=[text_input, url_input],
|
93 |
+
outputs=[output_audio],
|
94 |
+
fn=whisper_speech_demo,
|
95 |
+
cache_examples=False,
|
96 |
+
label="Try these to get started !🌟🌬️"
|
97 |
+
)
|
98 |
+
|
99 |
+
generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
|
100 |
+
|
101 |
+
pipe = Pipeline()#torch_compile=True)
|
102 |
+
pipe.generate("WhisperSpeech warmup")
|
103 |
+
|
104 |
+
demo.launch(server_port=3000)#, share=True)
|