jpc committed
Commit a68da21
1 Parent(s): f7b03d4

Simplified the UI and the generation code

Added more examples and support for a URL-based voice cloning sample.
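The URL-based cloning path boils down to extracting a speaker embedding from a local file or URL and running it through the three pipeline stages. A minimal sketch of those calls, lifted from the new generate_audio below (running these internals standalone is my assumption; the supported one-shot entry point is pipe.generate, as used in the warmup line):

    from whisperspeech.pipeline import Pipeline

    pipe = Pipeline()
    # Voice print from a reference recording; extract_spk_emb also takes a URL,
    # which is what the new url_input textbox feeds in.
    speaker = pipe.extract_spk_emb("https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg")
    # Text -> semantic tokens -> acoustic tokens -> 24 kHz waveform.
    stoks = pipe.t2s.generate(["Hello from WhisperSpeech!"], cps=14, lang=["en"])[0]
    atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))
    audio = pipe.vocoder.decode(atoks).cpu()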

Files changed (1)
  1. app.py +61 -72
app.py CHANGED
@@ -1,16 +1,12 @@
 import spaces
-import tempfile
-import wave
 import gradio as gr
+import io
 import os
-import re
+import re
 import torch
-import soundfile as sf
-import numpy as np
-import torch.nn.functional as F
+import torchaudio
+from pathlib import Path
 from whisperspeech.pipeline import Pipeline
-from whisperspeech.languages import LANGUAGES
-from whisperspeech.utils import resampler
 
 title = """# 🙋🏻‍♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech
 
@@ -26,9 +22,13 @@ This space runs on ZeroGPU, so **you need to be patient** while you acquire the
 
 
 text_examples = [
-    ["<en> WhisperSpeech is an opensource library that helps you hack whisper."],
-    ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
-    ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
+    ["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
+    ["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
+    ["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
+    ["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
+    # ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
+    ["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
+    # ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
 ]
 
 def parse_multilingual_text(input_text):
@@ -42,74 +42,63 @@ def parse_multilingual_text(input_text):
     return segments
 
 @spaces.GPU(enable_queue=True)
-def generate_segment_audio(text, lang, speaker_audio, pipe):
-    if not isinstance(text, str):
-        text = text.decode("utf-8") if isinstance(text, bytes) else str(text)
-    speaker_audio_data = speaker_audio
-    audio_data = pipe.generate(text, speaker_audio_data, lang)
-    resample_audio = resampler(newsr=24000)
-    audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
-    audio_np = audio_data_resampled.cpu().numpy()
-    # Debug statement print("Shape after resampling:", audio_np.shape)
-    return audio_np
-
-def concatenate_audio_segments(segments):
-    concatenated_audio = np.concatenate(segments , axis=1)
-    return concatenated_audio
+def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
+    if isinstance(speaker, (str, Path)): speaker = pipe.extract_spk_emb(speaker)
+    elif speaker_url: speaker = pipe.extract_spk_emb(speaker_url)
+    else: speaker = pipe.default_speaker
+    langs, texts = [list(x) for x in zip(*segments)]
+    print(texts, langs)
+    stoks = pipe.t2s.generate(texts, cps=cps, lang=langs)[0]
+    atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))
+    audio = pipe.vocoder.decode(atoks)
+    return audio.cpu()
+
+def whisper_speech_demo(multilingual_text, speaker_audio, speaker_url, cps):
+    if len(multilingual_text) == 0:
+        raise gr.Error("Please enter some text for me to speak!")
 
-
-@spaces.GPU(enable_queue=True)
-def whisper_speech_demo(multilingual_text, speaker_audio):
     segments = parse_multilingual_text(multilingual_text)
-    if not segments:
-        return None, "No valid language segments found. Please use the format: <lang> text"
-
-    pipe = Pipeline()
-    if not hasattr(pipe, 's2a'):
-        return None, "Pipeline initialization failed. s2a model not loaded."
-
-    speaker_url = speaker_audio if speaker_audio is not None else None
-    audio_segments = []
-
-    for lang, text in segments:
-        text_str = text if isinstance(text, str) else str(text)
-        audio_np = generate_segment_audio(text_str, lang, speaker_url, pipe)
-        # Debug statement print("Audio segment shape:", audio_np.shape)
-        audio_segments.append(audio_np)
 
-    concatenated_audio = concatenate_audio_segments(audio_segments)
-    # Debug statement print("Final concatenated audio shape:", concatenated_audio.shape)
-    concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
+    audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)
 
-    return (24000, concatenated_audio.T)
+    return (24000, audio.T.numpy())
 
+    # Did not work for me in Safari:
+    # mp3 = io.BytesIO()
+    # torchaudio.save(mp3, audio, 24000, format='mp3')
+    # return mp3.getvalue()
 
 with gr.Blocks() as demo:
     gr.Markdown(title)
-    output_audio = gr.Audio(label="🌟Collabora🌬️💬📝WhisperSpeech")
-    generate_button = gr.Button("Try 🌟Collabora🌬️💬📝WhisperSpeech")
-    with gr.Accordion("🌟Collabora🌬️WhisperSpeech💬Voice Print and📝Language List", open=False):
-        with gr.Row():
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(label="Enter multilingual text💬📝",
+                                    value=text_examples[0][0],
+                                    info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
+            cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
+                            label="Tempo (in characters per second)")
             speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
-                                     sources=["upload", "microphone"])
-        with gr.Row():
-            with gr.Accordion("Available Languages and Their Tags", open=False):
-                formatted_language_list = "\n".join([f"`<{lang}>` {LANGUAGES[lang]}" for lang in LANGUAGES])
-                gr.Markdown(formatted_language_list)
-        with gr.Row():
-            text_input = gr.Textbox(label="Enter multilingual text💬📝",
-                                    placeholder="e.g., <en> Hello <fr> Bonjour <es> Hola")
+                                     sources=["upload", "microphone"],
+                                     type='filepath')
+            gr.Markdown(" \n ") # fixes the bottom overflow from Audio
+            url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
+            generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
+        with gr.Column(scale=1):
+            output_audio = gr.Audio(label="WhisperSpeech says…")
+
     with gr.Row():
-        with gr.Accordion("Try Multilingual Text Examples", open=False):
-            gr.Examples(
-                examples=text_examples,
-                inputs=[text_input],
-                outputs=[output_audio],
-                fn=whisper_speech_demo,
-                cache_examples=False,
-                label="Try these to get started !🌟🌬️"
-            )
-
-    generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input], outputs=output_audio)
-
-    demo.launch()
+        gr.Examples(
+            examples=text_examples,
+            inputs=[text_input, url_input],
+            outputs=[output_audio],
+            fn=whisper_speech_demo,
+            cache_examples=False,
+            label="Try these to get started !🌟🌬️"
+        )
+
+    generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
+
+pipe = Pipeline()#torch_compile=True)
+pipe.generate("WhisperSpeech warmup")
+
+demo.launch(server_port=3000)#, share=True)
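
Usage note: Gradio's gr.Audio output takes a (sample_rate, samples) tuple, hence the (24000, audio.T.numpy()) return. If you want to drive the new function directly, a sketch (assumes pipe exists and is warmed up as at the bottom of app.py; passing None for both speaker inputs falls back to the default speaker):

    sr, samples = whisper_speech_demo(
        "<en> Hello! <pl> Cześć!",  # multilingual text with language tags
        None,                       # speaker_audio (uploaded/recorded file path)
        None,                       # speaker_url
        14,                         # cps, the tempo slider value
    )
    print(sr, samples.shape)        # 24000 and the waveform array for gr.Audio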