Tonic committed on
Commit
c4d7f81
•
1 Parent(s): fa57d13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -62
app.py CHANGED
@@ -12,82 +12,71 @@ from whisperspeech.languages import LANGUAGES
12
  from whisperspeech.pipeline import Pipeline
13
  from whisperspeech.utils import resampler
14
 
15
- title = """# 🙋🏻‍♂️ Welcome to🌟Tonic's🌬️💬📝WhisperSpeech
16
 
17
- You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Previously known as spear-tts-pytorch. It's like Stable Diffusion but for speech – both powerful and easily customizable.
18
  You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a></h3>
19
- Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [Poly](https://github.com/tonic-ai/poly) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 
 
 
 
20
  """
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  @spaces.GPU
23
- def whisper_speech_demo(text, lang, speaker_audio, mix_lang, mix_text):
24
- print(f"Text: {text}, Lang: {lang}, Speaker Audio: {speaker_audio}, Mix Lang: {mix_lang}, Mix Text: {mix_text}")
 
 
 
25
  pipe = Pipeline()
26
  speaker_url = speaker_audio if speaker_audio is not None else None
27
- if isinstance(lang, list):
28
- if not lang:
29
- raise ValueError("Language list is empty.")
30
- lang = lang[0]
31
- elif not isinstance(lang, str):
32
- raise ValueError("Language is not specified correctly.")
33
 
34
- if mix_lang and mix_text:
35
- mixed_langs = mix_lang.split(',') if isinstance(mix_lang, str) else mix_lang
36
- mixed_texts = mix_text.split(',')
37
- stoks = pipe.t2s.generate(mixed_texts, lang=mixed_langs)
38
- audio_data = pipe.generate(stoks, speaker_url, lang=mixed_langs[0])
39
- else:
40
- audio_data = pipe.generate(text, speaker_url, lang)
41
 
42
- resample_audio = resampler(newsr=24000)
43
- audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
44
- audio_np = audio_data_resampled.cpu().numpy()
45
- audio_np = audio_np / np.max(np.abs(audio_np))
46
- audio_np = np.asarray(audio_np, dtype=np.float32)
47
-
48
- audio_stereo = np.stack((audio_np, audio_np), axis=-1)
49
  audio_stereo = audio_stereo.reshape(-1, 2)
50
-
51
- print("Audio Array Shape:", audio_stereo.shape)
52
- print("Audio Array Dtype:", audio_stereo.dtype)
53
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
54
  sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')
55
- return tmp_file.name
 
56
 
57
  with gr.Blocks() as demo:
58
  gr.Markdown(title)
59
-
60
- with gr.Tabs():
61
- with gr.TabItem("🌬️💬📝Standard TTS"):
62
- with gr.Row():
63
- text_input_standard = gr.Textbox(label="Enter text")
64
- lang_input_standard = gr.Dropdown(choices=list(LANGUAGES.keys()), label="Language")
65
- speaker_input_standard = gr.Audio(label="Upload or Record Speaker Audio (optional)", sources=["upload", "microphone"], type="filepath")
66
- placeholder_mix_lang = gr.Textbox(visible=False)
67
- placeholder_mix_text = gr.Textbox(visible=False)
68
- generate_button_standard = gr.Button("Generate Speech")
69
- output_audio_standard = gr.Audio(label="🌬️💬📝WhisperSpeech")
70
-
71
- generate_button_standard.click(
72
- whisper_speech_demo,
73
- inputs=[text_input_standard, lang_input_standard, speaker_input_standard, placeholder_mix_lang, placeholder_mix_text],
74
- outputs=output_audio_standard
75
- )
76
-
77
- with gr.TabItem("🌬️💬📝Mixed Language TTS"):
78
- with gr.Row():
79
- placeholder_text_input = gr.Textbox(visible=False)
80
- placeholder_lang_input = gr.Dropdown(choices=[], visible=False)
81
- placeholder_speaker_input = gr.Audio(visible=False)
82
- mix_lang_input_mixed = gr.CheckboxGroup(choices=list(LANGUAGES.keys()), label="Select Languages")
83
- mix_text_input_mixed = gr.Textbox(label="Enter mixed language text", placeholder="e.g., Hello, Cześć")
84
- generate_button_mixed = gr.Button("Generate Mixed Speech")
85
- output_audio_mixed = gr.Audio(label="Mixed🌬️💬📝WhisperSpeech")
86
-
87
- generate_button_mixed.click(
88
- whisper_speech_demo,
89
- inputs=[placeholder_text_input, placeholder_lang_input, placeholder_speaker_input, mix_lang_input_mixed, mix_text_input_mixed],
90
- outputs=output_audio_mixed
91
- )
92
 
93
  demo.launch()
 
12
  from whisperspeech.pipeline import Pipeline
13
  from whisperspeech.utils import resampler
14
 
15
+ title = """# 🙋🏻‍♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech
16
 
17
+ You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Install it and use your command line interface locally with `pip install whisperspeech`. It's like Stable Diffusion but for speech – both powerful and easily customizable : so you can use it programmatically in your own pipelines! [Contribute to whisperspeech here](https://github.com/collabora/WhisperSpeech)
18
  You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a></h3>
19
+
20
+ We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
21
+
22
+ ### How to Use
23
+ Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print. Scroll down and try the api <3 Gradio.
24
  """
25
 
26
+ # Function to parse the multilingual input text
27
+ def parse_multilingual_text(input_text):
28
+ pattern = r"<(\w+)>\s(.*?)\s(?=<\w+>|$)"
29
+ segments = re.findall(pattern, input_text)
30
+ return [(lang, text.strip()) for lang, text in segments if lang in LANGUAGES.keys()]
31
+
32
+ # Function to generate audio for each language segment
33
+ def generate_segment_audio(text, lang, speaker_url, pipe):
34
+ stoks = pipe.t2s.generate([text], lang=[lang])
35
+ audio_data = pipe.generate(stoks, speaker_url, lang)
36
+ resample_audio = resampler(newsr=24000)
37
+ audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
38
+ audio_np = audio_data_resampled.cpu().numpy()
39
+ return audio_np
40
+
41
+ # Function to concatenate audio segments
42
+ def concatenate_audio_segments(segments):
43
+ concatenated_audio = np.concatenate(segments, axis=0)
44
+ concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
45
+ return np.asarray(concatenated_audio, dtype=np.float32)
46
+
47
  @spaces.GPU
48
+ def whisper_speech_demo(multilingual_text, speaker_audio):
49
+ segments = parse_multilingual_text(multilingual_text)
50
+ if not segments:
51
+ return None, "No valid language segments found. Please use the format: <lang> text"
52
+
53
  pipe = Pipeline()
54
  speaker_url = speaker_audio if speaker_audio is not None else None
55
+ audio_segments = []
 
 
 
 
 
56
 
57
+ for lang, text in segments:
58
+ audio_np = generate_segment_audio(text, lang, speaker_url, pipe)
59
+ audio_segments.append(audio_np)
 
 
 
 
60
 
61
+ concatenated_audio = concatenate_audio_segments(audio_segments)
62
+ audio_stereo = np.stack((concatenated_audio, concatenated_audio), axis=-1)
 
 
 
 
 
63
  audio_stereo = audio_stereo.reshape(-1, 2)
64
+
 
 
65
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
66
  sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')
67
+ return tmp_file.name
68
+
69
 
70
  with gr.Blocks() as demo:
71
  gr.Markdown(title)
72
+ output_audio = gr.Audio(label="Generated Speech")
73
+ generate_button = gr.Button("Try 🌟Collabora🌬️💬📝WhisperSpeech")
74
+ with gr.Row():
75
+ text_input = gr.Textbox(label="Enter multilingual text", placeholder="e.g., <en> Hello <fr> Bonjour <es> Hola", examples=["<en> Hello, how are you? <fr> Bonjour, comment ça va?", "<de> Guten Tag <it> Buongiorno <jp> こんにちは"])
76
+ speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)", sources=["upload", "microphone"], type="filepath", examples=["path/to/tonic.wav"])
77
+ with gr.Accordion("Available Languages and Their Tags"):
78
+ language_list = "\n".join([f"{lang}: {LANGUAGES[lang]}" for lang in LANGUAGES])
79
+ gr.Markdown(language_list)
80
+ generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input], outputs=output_audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  demo.launch()