whisperspeech

Runtime error

Tonic committed on Jan 25

Commit

c4b4e50

•

1 Parent(s): f9aebc6

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -21,7 +21,8 @@ You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬
 We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
 ### How to Use
-Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print. Scroll down and try the api <3 Gradio.
 """
 # text examples=["<en> Hello, how are you? <fr> Bonjour, comment ça va?", "<de> Guten Tag <it> Buongiorno <jp> こんにちは"]
@@ -46,9 +47,11 @@ def generate_segment_audio(text, lang, speaker_url, pipe):
     audio_np = audio_data_resampled.cpu().numpy()
     return audio_np
 def concatenate_audio_segments(segments):
-    max_len = max([seg.shape[0] for seg in segments])
-    padded_segments = [np.pad(seg, (0, max_len - seg.shape[0]), 'constant') for seg in segments]
     concatenated_audio = np.concatenate(padded_segments, axis=0)
     concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
     return np.asarray(concatenated_audio, dtype=np.float32)

 We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
 ### How to Use
+Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print.Scroll down and try the api <3 Gradio.
+This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
 """
 # text examples=["<en> Hello, how are you? <fr> Bonjour, comment ça va?", "<de> Guten Tag <it> Buongiorno <jp> こんにちは"]
     audio_np = audio_data_resampled.cpu().numpy()
     return audio_np
+# this function pads each segment to the length of the longest segment which is not optimal
 def concatenate_audio_segments(segments):
+    mono_segments = [seg[:, 0] if seg.ndim > 1 else seg for seg in segments]
+    max_len = max(seg.shape[0] for seg in mono_segments)
+    padded_segments = [np.pad(seg, (0, max_len - seg.shape[0]), 'constant') for seg in mono_segments]
     concatenated_audio = np.concatenate(padded_segments, axis=0)
     concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
     return np.asarray(concatenated_audio, dtype=np.float32)