basit123796 committed on
Commit
c95e552
1 Parent(s): 80b0f90

Upload 4 files

Browse files
Files changed (4) hide show
  1. app (1).py +138 -0
  2. dark-banner.png +0 -0
  3. light-banner.png +0 -0
  4. requirements (1).txt +3 -0
app (1).py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ import io
4
+ import os
5
+ import re
6
+ import torch
7
+ import torchaudio
8
+ from pathlib import Path
9
+ from whisperspeech.pipeline import Pipeline
10
+
11
+ DEVEL=os.environ.get('DEVEL', False)
12
+
13
+ title = """
14
+ <picture>
15
+ <source srcset="https://huggingface.co/spaces/collabora/whisperspeech/resolve/main/dark-banner.png" media="(prefers-color-scheme: dark)" />
16
+ <img alt="WhisperSpeech banner with Collabora and LAION logos" src="https://huggingface.co/spaces/collabora/whisperspeech/resolve/main/light-banner.png" style="width: 60%; margin: 0 auto;" />
17
+ </picture>
18
+
19
+ # Welcome to Collabora's WhisperSpeech
20
+
21
+ WhisperSpeech is an Open Source text-to-speech system built by Collabora and LAION by inverting Whisper.
22
+ The model is fully open and you can run it on your local hardware. It's like **Stable Diffusion but for speech**
23
+ – both powerful and easily customizable.
24
+
25
+ [You can contribute to WhisperSpeech on Github.](https://github.com/collabora/WhisperSpeech)
26
+ You can also join the discussion on Discord [![](https://dcbadge.vercel.app/api/server/FANw4rHD5E)](https://discord.gg/FANw4rHD5E)
27
+
28
+ Huge thanks to [Tonic](https://huggingface.co/Tonic) who helped build this Space for WhisperSpeech.
29
+
30
+ ### How to Use It
31
+
32
+ Write you text in the box, you can use language tags (`<en>` or `<pl>`) to create multilingual speech.
33
+ Optionally you can upload a speech sample or give it a file URL to clone an existing voice. Check out the
34
+ examples at the bottom of the page for inspiration.
35
+ """
36
+
37
+ footer = """
38
+
39
+ ### How to use it locally
40
+
41
+ ```
42
+ pip install -U WhisperSpeech
43
+ ```
44
+
45
+ Afterwards:
46
+
47
+ ```
48
+ from whisperspeech.pipeline import Pipeline
49
+
50
+ pipe = Pipeline(torch_compile=True)
51
+ pipe.generate_to_file("output.wav", "Hello from WhisperSpeech.")
52
+ ```
53
+ """
54
+
55
+
56
+ text_examples = [
57
+ ["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
58
+ ["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
59
+ ["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
60
+ ["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
61
+ # ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
62
+ ["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
63
+ # ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
64
+ ]
65
+
66
def parse_multilingual_text(input_text):
    """Split text with inline language tags into (lang, text) segments.

    Tags look like ``<en>`` or ``<pl>`` and switch the current language for
    all text that follows, until the next tag. Text before the first tag
    defaults to English.

    Returns a non-empty list of ``(lang_code, text)`` tuples. Each text chunk
    is padded with a leading and trailing space to give the TTS model some
    time to switch languages. Input with no plain text yields ``[("en", "")]``.
    """
    # Each match is either a language tag (group 1) or a run of plain text
    # up to the next "<" (group 2); exactly one of the two groups is set.
    pattern = r"(?:<(\w+)>)|([^<]+)"
    cur_lang = 'en'
    segments = []
    for lang, txt in re.findall(pattern, input_text):
        if lang:
            cur_lang = lang
        else:
            # add spaces to give it some time to switch languages
            segments.append((cur_lang, f" {txt} "))
    if not segments:
        return [("en", "")]
    return segments
75
+
76
@spaces.GPU(enable_queue=True)
def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
    """Run the full WhisperSpeech pipeline and return audio on the CPU.

    Speaker selection priority: an uploaded/recorded file path first, then a
    pasted URL, and finally the pipeline's default speaker embedding.

    ``segments`` is a list of ``(lang, text)`` pairs; ``cps`` is the target
    tempo in characters per second.
    """
    if isinstance(speaker, (str, Path)):
        spk_emb = pipe.extract_spk_emb(speaker)
    elif speaker_url:
        spk_emb = pipe.extract_spk_emb(speaker_url)
    else:
        spk_emb = pipe.default_speaker
    # Transpose the (lang, text) pairs into parallel lists.
    langs, texts = map(list, zip(*segments))
    print(texts, langs)
    # text -> semantic tokens -> acoustic tokens -> waveform
    semantic_toks = pipe.t2s.generate(texts, cps=cps, lang=langs)[0]
    acoustic_toks = pipe.s2a.generate(semantic_toks, spk_emb.unsqueeze(0))
    audio = pipe.vocoder.decode(acoustic_toks)
    return audio.cpu()
87
+
88
def whisper_speech_demo(multilingual_text, speaker_audio=None, speaker_url="", cps=14):
    """Gradio callback: synthesize speech for (optionally language-tagged) text.

    Returns a ``(sample_rate, samples)`` tuple as expected by ``gr.Audio``;
    the pipeline generates at 24 kHz.

    Raises:
        gr.Error: when no text was entered.
    """
    if not multilingual_text:
        raise gr.Error("Please enter some text for me to speak!")

    segments = parse_multilingual_text(multilingual_text)

    audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)

    return (24000, audio.T.numpy())

    # Returning encoded mp3 bytes did not work in Safari:
    # mp3 = io.BytesIO()
    # torchaudio.save(mp3, audio, 24000, format='mp3')
    # return mp3.getvalue()
102
+
103
# Build the pipeline once at module load. torch.compile is skipped in DEVEL
# mode to keep restarts fast.
pipe = Pipeline(torch_compile=not DEVEL)
# warmup will come from regenerating the examples

# NOTE(review): the layout nesting below was reconstructed from a
# whitespace-mangled copy of this file — confirm against the rendered Space.
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row(equal_height=True):
        # Left column: all inputs plus the generate button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(label="Enter multilingual text💬📝",
                                    value=text_examples[0][0],
                                    info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
            cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
                            label="Tempo (in characters per second)")
            # Optional voice cloning: either upload/record a sample or paste a URL.
            with gr.Row(equal_height=True):
                speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                                         sources=["upload", "microphone"],
                                         type='filepath')
                url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
            gr.Markdown(" \n ") # fixes the bottom overflow from Audio
            generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
        # Right column: the synthesized result.
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="WhisperSpeech says…")

    with gr.Column():
        gr.Markdown("### Try these examples to get started !🌟🌬️")
        # Cached examples also serve as model warmup (see note above); caching
        # is disabled in DEVEL mode.
        gr.Examples(
            examples=text_examples,
            inputs=[text_input, url_input],
            outputs=[output_audio],
            fn=whisper_speech_demo,
            cache_examples=not DEVEL,
        )

    generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
    gr.Markdown(footer)

# Fixed port 3000 in development; default port selection in production.
demo.launch(server_port=3000 if DEVEL else None)
dark-banner.png ADDED
light-banner.png ADDED
requirements (1).txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ whisperspeech
2
+ gradio
3
+ spaces