Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -12,82 +12,71 @@ from whisperspeech.languages import LANGUAGES
|
|
12 |
from whisperspeech.pipeline import Pipeline
|
13 |
from whisperspeech.utils import resampler
|
14 |
|
15 |
-
title = """# ππ»ββοΈ Welcome toπ
|
16 |
|
17 |
-
You can use this ZeroGPU Space to test out the current model [π¬οΈπ¬πcollabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). π¬οΈπ¬πcollabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper.
|
18 |
You can also use π¬οΈπ¬πWhisperSpeech by cloning this space. π§¬π¬π Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3>
|
19 |
-
|
|
|
|
|
|
|
|
|
20 |
"""
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
@spaces.GPU
|
23 |
-
def whisper_speech_demo(
|
24 |
-
|
|
|
|
|
|
|
25 |
pipe = Pipeline()
|
26 |
speaker_url = speaker_audio if speaker_audio is not None else None
|
27 |
-
|
28 |
-
if not lang:
|
29 |
-
raise ValueError("Language list is empty.")
|
30 |
-
lang = lang[0]
|
31 |
-
elif not isinstance(lang, str):
|
32 |
-
raise ValueError("Language is not specified correctly.")
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
stoks = pipe.t2s.generate(mixed_texts, lang=mixed_langs)
|
38 |
-
audio_data = pipe.generate(stoks, speaker_url, lang=mixed_langs[0])
|
39 |
-
else:
|
40 |
-
audio_data = pipe.generate(text, speaker_url, lang)
|
41 |
|
42 |
-
|
43 |
-
|
44 |
-
audio_np = audio_data_resampled.cpu().numpy()
|
45 |
-
audio_np = audio_np / np.max(np.abs(audio_np))
|
46 |
-
audio_np = np.asarray(audio_np, dtype=np.float32)
|
47 |
-
|
48 |
-
audio_stereo = np.stack((audio_np, audio_np), axis=-1)
|
49 |
audio_stereo = audio_stereo.reshape(-1, 2)
|
50 |
-
|
51 |
-
print("Audio Array Shape:", audio_stereo.shape)
|
52 |
-
print("Audio Array Dtype:", audio_stereo.dtype)
|
53 |
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
|
54 |
sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')
|
55 |
-
|
|
|
56 |
|
57 |
with gr.Blocks() as demo:
|
58 |
gr.Markdown(title)
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
generate_button_standard = gr.Button("Generate Speech")
|
69 |
-
output_audio_standard = gr.Audio(label="π¬οΈπ¬πWhisperSpeech")
|
70 |
-
|
71 |
-
generate_button_standard.click(
|
72 |
-
whisper_speech_demo,
|
73 |
-
inputs=[text_input_standard, lang_input_standard, speaker_input_standard, placeholder_mix_lang, placeholder_mix_text],
|
74 |
-
outputs=output_audio_standard
|
75 |
-
)
|
76 |
-
|
77 |
-
with gr.TabItem("π¬οΈπ¬πMixed Language TTS"):
|
78 |
-
with gr.Row():
|
79 |
-
placeholder_text_input = gr.Textbox(visible=False)
|
80 |
-
placeholder_lang_input = gr.Dropdown(choices=[], visible=False)
|
81 |
-
placeholder_speaker_input = gr.Audio(visible=False)
|
82 |
-
mix_lang_input_mixed = gr.CheckboxGroup(choices=list(LANGUAGES.keys()), label="Select Languages")
|
83 |
-
mix_text_input_mixed = gr.Textbox(label="Enter mixed language text", placeholder="e.g., Hello, CzeΕΔ")
|
84 |
-
generate_button_mixed = gr.Button("Generate Mixed Speech")
|
85 |
-
output_audio_mixed = gr.Audio(label="Mixedπ¬οΈπ¬πWhisperSpeech")
|
86 |
-
|
87 |
-
generate_button_mixed.click(
|
88 |
-
whisper_speech_demo,
|
89 |
-
inputs=[placeholder_text_input, placeholder_lang_input, placeholder_speaker_input, mix_lang_input_mixed, mix_text_input_mixed],
|
90 |
-
outputs=output_audio_mixed
|
91 |
-
)
|
92 |
|
93 |
demo.launch()
|
|
|
12 |
from whisperspeech.pipeline import Pipeline
|
13 |
from whisperspeech.utils import resampler
|
14 |
|
15 |
+
title = """# ππ»ββοΈ Welcome toπCollaboraπ¬οΈπ¬πWhisperSpeech
|
16 |
|
17 |
+
You can use this ZeroGPU Space to test out the current model [π¬οΈπ¬πcollabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). π¬οΈπ¬πcollabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Install it and use your command line interface locally with `pip install whisperspeech`. It's like Stable Diffusion but for speech β both powerful and easily customizable : so you can use it programmatically in your own pipelines! [Contribute to whisperspeech here](https://github.com/collabora/WhisperSpeech)
|
18 |
You can also use π¬οΈπ¬πWhisperSpeech by cloning this space. π§¬π¬π Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3>
|
19 |
+
|
20 |
+
We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant π€
|
21 |
+
|
22 |
+
### How to Use
|
23 |
+
Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print. Scroll down and try the api <3 Gradio.
|
24 |
"""
|
25 |
|
26 |
# Function to parse the multilingual input text
def parse_multilingual_text(input_text):
    """Split ``input_text`` into per-language segments.

    The expected format is ``<lang> some text <lang2> more text``. Only
    language codes present in ``LANGUAGES`` are kept.

    Parameters
    ----------
    input_text : str
        Text containing ``<lang>`` tags followed by the segment text.

    Returns
    -------
    list[tuple[str, str]]
        ``(lang, text)`` pairs in input order, with surrounding whitespace
        stripped from each text segment.
    """
    # Use \s* (not a mandatory \s) around the segment text: the original
    # pattern required a whitespace character before end-of-string, which
    # silently dropped the final segment of inputs like "<en> Hello".
    pattern = r"<(\w+)>\s*(.*?)\s*(?=<\w+>|$)"
    segments = re.findall(pattern, input_text)
    return [(lang, text.strip()) for lang, text in segments if lang in LANGUAGES]
|
31 |
+
|
32 |
+
# Function to generate audio for each language segment
|
33 |
+
def generate_segment_audio(text, lang, speaker_url, pipe):
|
34 |
+
stoks = pipe.t2s.generate([text], lang=[lang])
|
35 |
+
audio_data = pipe.generate(stoks, speaker_url, lang)
|
36 |
+
resample_audio = resampler(newsr=24000)
|
37 |
+
audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
|
38 |
+
audio_np = audio_data_resampled.cpu().numpy()
|
39 |
+
return audio_np
|
40 |
+
|
41 |
+
# Function to concatenate audio segments
|
42 |
+
def concatenate_audio_segments(segments):
|
43 |
+
concatenated_audio = np.concatenate(segments, axis=0)
|
44 |
+
concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
|
45 |
+
return np.asarray(concatenated_audio, dtype=np.float32)
|
46 |
+
|
47 |
@spaces.GPU
def whisper_speech_demo(multilingual_text, speaker_audio):
    """Generate multilingual speech and return the path to a 24 kHz stereo WAV.

    Parameters
    ----------
    multilingual_text : str
        Text in ``<lang> segment`` format (see ``parse_multilingual_text``).
    speaker_audio : str or None
        Optional filepath of a reference recording used as the voice print.

    Returns
    -------
    str
        Path to a temporary 16-bit PCM WAV file served by Gradio.

    Raises
    ------
    gr.Error
        If no valid ``<lang>`` segments are found in the input.
    """
    segments = parse_multilingual_text(multilingual_text)
    if not segments:
        # The original returned a (None, message) tuple here, but the click
        # handler wires a single Audio output — Gradio cannot unpack two
        # values into one component. Raising gr.Error surfaces the message.
        raise gr.Error("No valid language segments found. Please use the format: <lang> text")

    pipe = Pipeline()
    # speaker_audio is either a filepath or None; None lets the pipeline use
    # its default voice — TODO confirm against Pipeline.generate.
    speaker_url = speaker_audio
    audio_segments = [
        generate_segment_audio(text, lang, speaker_url, pipe)
        for lang, text in segments
    ]

    concatenated_audio = concatenate_audio_segments(audio_segments)
    # Duplicate the mono signal into an (n, 2) stereo buffer for playback.
    audio_stereo = np.stack((concatenated_audio, concatenated_audio), axis=-1)
    audio_stereo = audio_stereo.reshape(-1, 2)

    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
        sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')
    return tmp_file.name
68 |
+
|
69 |
|
70 |
# Build the Gradio UI: one multilingual text box, an optional speaker
# recording, a language-tag reference accordion, and a single audio output.
with gr.Blocks() as demo:
    gr.Markdown(title)
    output_audio = gr.Audio(label="Generated Speech")
    generate_button = gr.Button("Try πCollaboraπ¬οΈπ¬πWhisperSpeech")
    with gr.Row():
        # NOTE: gr.Textbox and gr.Audio accept no `examples=` kwarg — passing
        # one raises TypeError at startup (the Space's "Runtime error").
        # Example prompts are provided via gr.Examples below instead; the
        # audio example was a placeholder path and is dropped.
        text_input = gr.Textbox(label="Enter multilingual text", placeholder="e.g., <en> Hello <fr> Bonjour <es> Hola")
        speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)", sources=["upload", "microphone"], type="filepath")
    gr.Examples(
        examples=[
            "<en> Hello, how are you? <fr> Bonjour, comment Γ§a va?",
            "<de> Guten Tag <it> Buongiorno <jp> γγγ«γ‘γ―",
        ],
        inputs=text_input,
    )
    with gr.Accordion("Available Languages and Their Tags"):
        language_list = "\n".join([f"{lang}: {LANGUAGES[lang]}" for lang in LANGUAGES])
        gr.Markdown(language_list)
    generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input], outputs=output_audio)

demo.launch()
|