Spaces:

herimor
/

voxtream

Running on Zero

App Files Files Community

herimor committed on Sep 28

Commit

fd0d55b

1 Parent(s): 07fe0e2

Add streaming output

Browse files

Files changed (5) hide show

.gitattributes +2 -0
app.py +53 -10
gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n +3 -0
gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws +3 -0
gradio_cached_examples/16/log.csv +3 -0

.gitattributes CHANGED Viewed

@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/app/male.wav filter=lfs diff=lfs merge=lfs -text
 assets/app/female.wav filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/app/male.wav filter=lfs diff=lfs merge=lfs -text
 assets/app/female.wav filter=lfs diff=lfs merge=lfs -text
+gradio_cached_examples/16/Synthesized[[:space:]]audio/95f83d950a0400b268bd/tmppmcwrg5n filter=lfs diff=lfs merge=lfs -text
+gradio_cached_examples/16/Synthesized[[:space:]]audio/b5933b8060d980ce1ea1/tmp339_glws filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -37,6 +37,9 @@ nltk.download("punkt", quiet=True, raise_on_error=True)
 # Initialize speech generator
 speech_generator = SpeechGenerator(config)
 CUSTOM_CSS = """
 /* overall width */
 .gradio-container {max-width: 1100px !important}
@@ -51,6 +54,27 @@ CUSTOM_CSS = """
 audio {outline: none;}
 """
 @spaces.GPU
 def synthesize_fn(prompt_audio_path, prompt_text, target_text):
@@ -69,17 +93,30 @@ def synthesize_fn(prompt_audio_path, prompt_text, target_text):
         prompt_audio_path=Path(prompt_audio_path),
         text=target_text,
     )
-    frames = [frame for frame, _ in stream]
-    if not frames:
-        return None
-    waveform = np.concatenate(frames).astype(np.float32)
-    # Fade out
-    fade_len_sec = 0.1
-    fade_out = np.linspace(1.0, 0.0, int(config.mimi_sr * fade_len_sec))
-    waveform[-int(config.mimi_sr * fade_len_sec) :] *= fade_out
-    return (config.mimi_sr, waveform)
 def main():
@@ -108,9 +145,10 @@ def main():
                     placeholder="What you want the model to say",
                 )
                 output_audio = gr.Audio(
-                    type="numpy",
                     label="Synthesized audio",
                     interactive=False,
                 )
         with gr.Row():
@@ -140,6 +178,11 @@ def main():
         # --- Wire up actions ---
         submit_btn.click(
             fn=synthesize_fn,
             inputs=[prompt_audio, prompt_text, target_text],
             outputs=output_audio,

 # Initialize speech generator
 speech_generator = SpeechGenerator(config)
+FADE_OUT_SEC = 0.10
+MIN_CHUNK_SEC = 0.2
+CHUNK_SIZE = int(config.mimi_sr * MIN_CHUNK_SEC)
 CUSTOM_CSS = """
 /* overall width */
 .gradio-container {max-width: 1100px !important}
 audio {outline: none;}
 """
def float32_to_int16(audio_float32: np.ndarray) -> np.ndarray:
    """
    Convert float32 audio samples (-1.0 to 1.0) to int16 PCM samples.

    Samples outside [-1.0, 1.0] are clipped to that range. Non-finite
    samples are sanitised first: NaN becomes 0.0 (silence) and +/-inf
    become +/-1.0, so the integer cast is always well defined. The input
    array is not modified.

    Parameters:
        audio_float32 (np.ndarray): Input float32 audio samples.
    Returns:
        np.ndarray: Output int16 audio samples, same shape as the input.
    Raises:
        ValueError: If the input array's dtype is not float32.
    """
    if audio_float32.dtype != np.float32:
        raise ValueError("Input must be a float32 numpy array")
    # np.clip lets NaN through, and casting NaN to an integer type is
    # undefined (platform-dependent value plus a RuntimeWarning), so replace
    # non-finite samples before scaling.
    audio_finite = np.nan_to_num(audio_float32, nan=0.0, posinf=1.0, neginf=-1.0)
    # Clip to avoid overflow after scaling
    audio_clipped = np.clip(audio_finite, -1.0, 1.0)
    # Scale to the symmetric int16 range and convert (truncates toward zero)
    audio_int16 = (audio_clipped * 32767).astype(np.int16)
    return audio_int16
 @spaces.GPU
 def synthesize_fn(prompt_audio_path, prompt_text, target_text):
         prompt_audio_path=Path(prompt_audio_path),
         text=target_text,
     )
+    buffer = []
+    buffer_len = 0
+    for frame, _ in stream:
+        buffer.append(frame)
+        buffer_len += frame.shape[0]
+        if buffer_len >= CHUNK_SIZE:
+            audio = np.concatenate(buffer)
+            yield (config.mimi_sr, float32_to_int16(audio))
+            # Reset buffer and length
+            buffer = []
+            buffer_len = 0
+    # Handle any remaining audio in the buffer
+    if buffer_len > 0:
+        final = np.concatenate(buffer)
+        nfade = min(int(config.mimi_sr * FADE_OUT_SEC), final.shape[0])
+        if nfade > 0:
+            fade = np.linspace(1.0, 0.0, nfade, dtype=np.float32)
+            final[-nfade:] *= fade
+        yield (config.mimi_sr, float32_to_int16(final))
 def main():
                     placeholder="What you want the model to say",
                 )
                 output_audio = gr.Audio(
                     label="Synthesized audio",
                     interactive=False,
+                    streaming=True,
+                    autoplay=True,
                 )
         with gr.Row():
         # --- Wire up actions ---
         submit_btn.click(
+            fn=lambda a, p, t: None,  # clears the audio value
+            inputs=[prompt_audio, prompt_text, target_text],
+            outputs=output_audio,
+            show_progress="hidden",
+        ).then(
             fn=synthesize_fn,
             inputs=[prompt_audio, prompt_text, target_text],
             outputs=output_audio,

gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac85b968e44a98af1e2f344ed56f68c700cd2b99a3c114d2552c66b2b6c2e957
+size 326444

gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a15baf860116573dd4985238c7a05fe3120f3732b43bef7d8c8aa22e07b5fbd
+size 322604

gradio_cached_examples/16/log.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+Synthesized audio,flag,username,timestamp
+"{""path"": ""gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n"", ""url"": null, ""size"": null, ""orig_name"": null, ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2025-09-28 16:43:00.957637
+"{""path"": ""gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws"", ""url"": null, ""size"": null, ""orig_name"": null, ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2025-09-28 16:43:06.729484