herimor committed on
Commit
fd0d55b
·
1 Parent(s): 07fe0e2

Add streaming output

Browse files
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  assets/app/male.wav filter=lfs diff=lfs merge=lfs -text
37
  assets/app/female.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  assets/app/male.wav filter=lfs diff=lfs merge=lfs -text
37
  assets/app/female.wav filter=lfs diff=lfs merge=lfs -text
38
+ gradio_cached_examples/16/Synthesized[[:space:]]audio/95f83d950a0400b268bd/tmppmcwrg5n filter=lfs diff=lfs merge=lfs -text
39
+ gradio_cached_examples/16/Synthesized[[:space:]]audio/b5933b8060d980ce1ea1/tmp339_glws filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -37,6 +37,9 @@ nltk.download("punkt", quiet=True, raise_on_error=True)
37
  # Initialize speech generator
38
  speech_generator = SpeechGenerator(config)
39
 
 
 
 
40
  CUSTOM_CSS = """
41
  /* overall width */
42
  .gradio-container {max-width: 1100px !important}
@@ -51,6 +54,27 @@ CUSTOM_CSS = """
51
  audio {outline: none;}
52
  """
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  @spaces.GPU
56
  def synthesize_fn(prompt_audio_path, prompt_text, target_text):
@@ -69,17 +93,30 @@ def synthesize_fn(prompt_audio_path, prompt_text, target_text):
69
  prompt_audio_path=Path(prompt_audio_path),
70
  text=target_text,
71
  )
72
- frames = [frame for frame, _ in stream]
73
- if not frames:
74
- return None
75
- waveform = np.concatenate(frames).astype(np.float32)
76
 
77
- # Fade out
78
- fade_len_sec = 0.1
79
- fade_out = np.linspace(1.0, 0.0, int(config.mimi_sr * fade_len_sec))
80
- waveform[-int(config.mimi_sr * fade_len_sec) :] *= fade_out
 
 
 
 
 
 
81
 
82
- return (config.mimi_sr, waveform)
 
 
 
 
 
 
 
 
 
 
 
83
 
84
 
85
  def main():
@@ -108,9 +145,10 @@ def main():
108
  placeholder="What you want the model to say",
109
  )
110
  output_audio = gr.Audio(
111
- type="numpy",
112
  label="Synthesized audio",
113
  interactive=False,
 
 
114
  )
115
 
116
  with gr.Row():
@@ -140,6 +178,11 @@ def main():
140
 
141
  # --- Wire up actions ---
142
  submit_btn.click(
 
 
 
 
 
143
  fn=synthesize_fn,
144
  inputs=[prompt_audio, prompt_text, target_text],
145
  outputs=output_audio,
 
37
  # Initialize speech generator
38
  speech_generator = SpeechGenerator(config)
39
 
40
+ FADE_OUT_SEC = 0.10
41
+ MIN_CHUNK_SEC = 0.2
42
+ CHUNK_SIZE = int(config.mimi_sr * MIN_CHUNK_SEC)
43
  CUSTOM_CSS = """
44
  /* overall width */
45
  .gradio-container {max-width: 1100px !important}
 
54
  audio {outline: none;}
55
  """
56
 
57
+ def float32_to_int16(audio_float32: np.ndarray) -> np.ndarray:
58
+ """
59
+ Convert float32 audio samples (-1.0 to 1.0) to int16 PCM samples.
60
+
61
+ Parameters:
62
+ audio_float32 (np.ndarray): Input float32 audio samples.
63
+
64
+ Returns:
65
+ np.ndarray: Output int16 audio samples.
66
+ """
67
+ if audio_float32.dtype != np.float32:
68
+ raise ValueError("Input must be a float32 numpy array")
69
+
70
+ # Clip to avoid overflow after scaling
71
+ audio_clipped = np.clip(audio_float32, -1.0, 1.0)
72
+
73
+ # Scale and convert
74
+ audio_int16 = (audio_clipped * 32767).astype(np.int16)
75
+
76
+ return audio_int16
77
+
78
 
79
  @spaces.GPU
80
  def synthesize_fn(prompt_audio_path, prompt_text, target_text):
 
93
  prompt_audio_path=Path(prompt_audio_path),
94
  text=target_text,
95
  )
 
 
 
 
96
 
97
+ buffer = []
98
+ buffer_len = 0
99
+
100
+ for frame, _ in stream:
101
+ buffer.append(frame)
102
+ buffer_len += frame.shape[0]
103
+
104
+ if buffer_len >= CHUNK_SIZE:
105
+ audio = np.concatenate(buffer)
106
+ yield (config.mimi_sr, float32_to_int16(audio))
107
 
108
+ # Reset buffer and length
109
+ buffer = []
110
+ buffer_len = 0
111
+
112
+ # Handle any remaining audio in the buffer
113
+ if buffer_len > 0:
114
+ final = np.concatenate(buffer)
115
+ nfade = min(int(config.mimi_sr * FADE_OUT_SEC), final.shape[0])
116
+ if nfade > 0:
117
+ fade = np.linspace(1.0, 0.0, nfade, dtype=np.float32)
118
+ final[-nfade:] *= fade
119
+ yield (config.mimi_sr, float32_to_int16(final))
120
 
121
 
122
  def main():
 
145
  placeholder="What you want the model to say",
146
  )
147
  output_audio = gr.Audio(
 
148
  label="Synthesized audio",
149
  interactive=False,
150
+ streaming=True,
151
+ autoplay=True,
152
  )
153
 
154
  with gr.Row():
 
178
 
179
  # --- Wire up actions ---
180
  submit_btn.click(
181
+ fn=lambda a, p, t: None, # clears the audio value
182
+ inputs=[prompt_audio, prompt_text, target_text],
183
+ outputs=output_audio,
184
+ show_progress="hidden",
185
+ ).then(
186
  fn=synthesize_fn,
187
  inputs=[prompt_audio, prompt_text, target_text],
188
  outputs=output_audio,
gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac85b968e44a98af1e2f344ed56f68c700cd2b99a3c114d2552c66b2b6c2e957
3
+ size 326444
gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a15baf860116573dd4985238c7a05fe3120f3732b43bef7d8c8aa22e07b5fbd
3
+ size 322604
gradio_cached_examples/16/log.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Synthesized audio,flag,username,timestamp
2
+ "{""path"": ""gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n"", ""url"": null, ""size"": null, ""orig_name"": null, ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2025-09-28 16:43:00.957637
3
+ "{""path"": ""gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws"", ""url"": null, ""size"": null, ""orig_name"": null, ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2025-09-28 16:43:06.729484