PlotweaverModel committed on
Commit
9cf39bc
·
verified ·
1 Parent(s): e51934c

update for live streaming

Browse files
Files changed (2) hide show
  1. README.md +4 -5
  2. app.py +269 -121
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Live Football Commentary - English to Yoruba
3
- emoji: 🏟️
4
  colorFrom: green
5
  colorTo: yellow
6
  sdk: gradio
@@ -19,8 +19,7 @@ tags:
19
  - yoruba
20
  - football
21
  - commentary
22
- - asr
23
- - tts
24
- - nllb
25
- short_description: Translate live English football commentary to Yoruba speech
26
  ---
 
1
  ---
2
  title: Live Football Commentary - English to Yoruba
3
+ emoji: "\U0001F3DF\uFE0F"
4
  colorFrom: green
5
  colorTo: yellow
6
  sdk: gradio
 
19
  - yoruba
20
  - football
21
  - commentary
22
+ - streaming
23
+ - real-time
24
+ short_description: Real-time English football commentary to Yoruba speech
 
25
  ---
app.py CHANGED
@@ -1,15 +1,18 @@
1
  """
2
- Live Football Commentary Pipeline — English → Yoruba
3
- =====================================================
4
- Gradio app for HuggingFace Spaces.
5
 
6
- Pipeline: ASR (Whisper) MT (NLLB-200) TTS (MMS-TTS Yoruba)
 
7
  """
8
 
9
  import torch
10
  import numpy as np
11
  import re
12
  import time
 
 
13
  import gradio as gr
14
  from transformers import (
15
  pipeline as hf_pipeline,
@@ -17,6 +20,9 @@ from transformers import (
17
  AutoModelForSeq2SeqLM,
18
  )
19
 
 
 
 
20
  # =============================================================================
21
  # Configuration
22
  # =============================================================================
@@ -31,6 +37,10 @@ MT_TGT_LANG = "yor_Latn"
31
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
32
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
33
 
 
 
 
 
34
 
35
  # =============================================================================
36
  # Load models (runs once at startup)
@@ -39,7 +49,6 @@ TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
39
  print(f"Device: {DEVICE} | Dtype: {TORCH_DTYPE}")
40
  print("Loading models...")
41
 
42
- # ASR
43
  print(f" Loading ASR: {ASR_MODEL_ID}")
44
  asr_pipe = hf_pipeline(
45
  "automatic-speech-recognition",
@@ -47,19 +56,17 @@ asr_pipe = hf_pipeline(
47
  device=DEVICE,
48
  torch_dtype=TORCH_DTYPE,
49
  )
50
- print(" ASR loaded")
51
 
52
- # MT
53
  print(f" Loading MT: {MT_MODEL_ID}")
54
  mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID)
55
  mt_model = AutoModelForSeq2SeqLM.from_pretrained(
56
- MT_MODEL_ID,
57
- torch_dtype=TORCH_DTYPE,
58
  ).to(DEVICE)
59
  mt_tokenizer.src_lang = MT_SRC_LANG
60
- print(" MT loaded ✓")
 
61
 
62
- # TTS
63
  print(f" Loading TTS: {TTS_MODEL_ID}")
64
  tts_pipe = hf_pipeline(
65
  "text-to-speech",
@@ -67,29 +74,23 @@ tts_pipe = hf_pipeline(
67
  device=DEVICE,
68
  torch_dtype=TORCH_DTYPE,
69
  )
70
- print(" TTS loaded")
71
  print("All models loaded!")
72
 
73
 
74
  # =============================================================================
75
- # Pipeline functions (from working Colab notebook)
76
  # =============================================================================
77
 
78
  def split_into_sentences(text):
79
- """Split raw ASR text into individual sentences for MT."""
80
  text = text.strip()
81
  if not text:
82
  return []
83
-
84
- # Normalize case
85
  text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip())
86
-
87
- # If text has punctuation, split on it
88
  if re.search(r'[.!?]', text):
89
  sentences = re.split(r'(?<=[.!?])\s+', text)
90
  return [s.strip() for s in sentences if s.strip()]
91
-
92
- # No punctuation — split into ~12 word chunks
93
  words = text.split()
94
  MAX_WORDS = 12
95
  sentences = []
@@ -103,21 +104,19 @@ def split_into_sentences(text):
103
 
104
 
105
  def transcribe(audio_array, sample_rate=16000):
106
- """ASR: English audio English text."""
 
 
107
  result = asr_pipe(
108
  {"raw": audio_array, "sampling_rate": sample_rate},
109
- chunk_length_s=15,
110
- batch_size=1,
111
  return_timestamps=False,
112
  )
113
  return result["text"].strip()
114
 
115
 
116
  def translate_sentence(text, max_length=256):
117
- """MT: Translate a single sentence from English to Yoruba."""
118
  inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
119
- tgt_lang_id = mt_tokenizer.convert_tokens_to_ids(MT_TGT_LANG)
120
-
121
  with torch.no_grad():
122
  output_ids = mt_model.generate(
123
  **inputs,
@@ -131,124 +130,228 @@ def translate_sentence(text, max_length=256):
131
  return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
132
 
133
 
134
- def translate_long_text(text):
135
- """Split into sentences and translate each individually."""
136
  sentences = split_into_sentences(text)
137
- translations = []
138
- for sent in sentences:
139
- yo = translate_sentence(sent)
140
- translations.append(yo)
141
- return ' '.join(translations), sentences, translations
142
 
143
 
144
  def synthesize(text):
145
- """TTS: Yoruba text audio."""
 
 
146
  result = tts_pipe(text)
147
  audio = np.array(result["audio"]).squeeze()
148
  sr = result["sampling_rate"]
149
  return audio, sr
150
 
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  # =============================================================================
153
  # Gradio interface functions
154
  # =============================================================================
155
 
156
- def process_audio(audio_input):
157
- """
158
- Full pipeline: English audio → Yoruba audio.
159
- audio_input: tuple of (sample_rate, numpy_array) from Gradio.
160
- """
161
  if audio_input is None:
162
- return None, "⚠️ No audio provided. Please upload or record audio."
163
 
164
  sample_rate, audio_array = audio_input
165
-
166
- # Convert to float32 mono if needed
167
  audio_array = audio_array.astype(np.float32)
168
  if audio_array.ndim > 1:
169
  audio_array = audio_array.mean(axis=1)
170
-
171
- # Normalize to [-1, 1] if integer audio
172
  if audio_array.max() > 1.0 or audio_array.min() < -1.0:
173
  audio_array = audio_array / max(abs(audio_array.max()), abs(audio_array.min()))
174
 
175
  total_start = time.time()
176
- log_lines = []
177
 
178
- # Step 1: ASR
179
  t0 = time.time()
180
- english_text = transcribe(audio_array, sample_rate)
181
- asr_time = time.time() - t0
182
- log_lines.append(f"**🎤 ASR** ({asr_time:.2f}s)")
183
- log_lines.append(f"English: {english_text}")
184
- log_lines.append("")
185
 
186
- if not english_text:
187
- return None, "⚠️ ASR returned empty text. Please try with clearer audio."
188
 
189
- # Step 2: MT (sentence by sentence)
190
- t0 = time.time()
191
- yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text)
192
- mt_time = time.time() - t0
193
- log_lines.append(f"**🔄 Translation** ({mt_time:.2f}s)")
194
- for en_s, yo_s in zip(en_sentences, yo_sentences):
195
- log_lines.append(f" EN: {en_s}")
196
- log_lines.append(f" YO: {yo_s}")
197
- log_lines.append("")
198
-
199
- if not yoruba_text:
200
- return None, "⚠️ Translation returned empty text."
201
-
202
- # Step 3: TTS
203
  t0 = time.time()
204
- yoruba_audio, output_sr = synthesize(yoruba_text)
205
- tts_time = time.time() - t0
206
- log_lines.append(f"**🔊 TTS** ({tts_time:.2f}s) {len(yoruba_audio)/output_sr:.2f}s of audio")
 
 
 
 
 
207
 
208
- total = time.time() - total_start
209
- log_lines.append("")
210
- log_lines.append(f"**Total: {total:.2f}s**")
211
 
212
- log_output = "\n".join(log_lines)
 
 
 
 
213
 
214
- return (output_sr, yoruba_audio), log_output
215
 
216
 
217
- def process_text(english_text):
218
- """
219
- Text-only mode: English text Yoruba text + audio.
220
- Skips the ASR stage useful for testing MT + TTS.
221
- """
222
- if not english_text or not english_text.strip():
223
- return None, "⚠️ Please enter some English text."
224
 
225
- total_start = time.time()
226
- log_lines = []
227
 
228
  # MT
229
  t0 = time.time()
230
- yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text.strip())
231
- mt_time = time.time() - t0
232
- log_lines.append(f"**🔄 Translation** ({mt_time:.2f}s)")
233
- for en_s, yo_s in zip(en_sentences, yo_sentences):
234
- log_lines.append(f" EN: {en_s}")
235
- log_lines.append(f" YO: {yo_s}")
236
- log_lines.append("")
237
-
238
- if not yoruba_text:
239
- return None, "⚠️ Translation returned empty text."
240
 
241
  # TTS
242
  t0 = time.time()
243
- yoruba_audio, output_sr = synthesize(yoruba_text)
244
- tts_time = time.time() - t0
245
- log_lines.append(f"**🔊 TTS** ({tts_time:.2f}s) {len(yoruba_audio)/output_sr:.2f}s of audio")
246
 
247
- total = time.time() - total_start
248
- log_lines.append("")
249
- log_lines.append(f"**Total: {total:.2f}s**")
250
 
251
- return (output_sr, yoruba_audio), "\n".join(log_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
 
254
  # =============================================================================
@@ -256,15 +359,24 @@ def process_text(english_text):
256
  # =============================================================================
257
 
258
  DESCRIPTION = """
259
- # 🏟️ Live Football Commentary English Yoruba
260
 
261
  Translate English football commentary into Yoruba speech in real-time.
262
 
263
- **Pipeline:** ASR (Whisper) MT (NLLB-200) TTS (MMS-TTS Yoruba)
264
-
265
- Upload or record English commentary audio, and get back Yoruba audio + full transcript.
266
  """
267
 
 
 
 
 
 
 
 
 
 
 
 
268
  EXAMPLES_TEXT = [
269
  "And it's a brilliant goal from the striker!",
270
  "The referee has shown a yellow card. Corner kick for the home team.",
@@ -273,7 +385,7 @@ EXAMPLES_TEXT = [
273
  ]
274
 
275
  with gr.Blocks(
276
- title="Football Commentary EN→YO",
277
  theme=gr.themes.Soft(),
278
  ) as demo:
279
 
@@ -281,9 +393,47 @@ with gr.Blocks(
281
 
282
  with gr.Tabs():
283
 
284
- # ---- Tab 1: Audio Audio (Full Pipeline) ----
285
- with gr.TabItem("🎙️ Audio → Audio (Full Pipeline)"):
286
- gr.Markdown("Upload or record English commentary. The pipeline will transcribe, translate, and synthesize Yoruba audio.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
288
  with gr.Row():
289
  with gr.Column():
@@ -292,21 +442,21 @@ with gr.Blocks(
292
  type="numpy",
293
  sources=["upload", "microphone"],
294
  )
295
- audio_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
296
 
297
  with gr.Column():
298
  audio_output = gr.Audio(label="Yoruba Commentary Audio", type="numpy")
299
  audio_log = gr.Markdown(label="Pipeline Log")
300
 
301
- audio_submit_btn.click(
302
- fn=process_audio,
303
  inputs=[audio_input],
304
  outputs=[audio_output, audio_log],
305
  )
306
 
307
- # ---- Tab 2: Text Audio (Skip ASR) ----
308
- with gr.TabItem("📝 Text Audio (Translation + TTS)"):
309
- gr.Markdown("Type or paste English text to translate to Yoruba and hear the result. Useful for testing without audio.")
310
 
311
  with gr.Row():
312
  with gr.Column():
@@ -315,8 +465,7 @@ with gr.Blocks(
315
  placeholder="Type English football commentary here...",
316
  lines=4,
317
  )
318
- text_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
319
-
320
  gr.Examples(
321
  examples=[[e] for e in EXAMPLES_TEXT],
322
  inputs=[text_input],
@@ -327,20 +476,19 @@ with gr.Blocks(
327
  text_audio_output = gr.Audio(label="Yoruba Audio", type="numpy")
328
  text_log = gr.Markdown(label="Pipeline Log")
329
 
330
- text_submit_btn.click(
331
- fn=process_text,
332
  inputs=[text_input],
333
  outputs=[text_audio_output, text_log],
334
  )
335
 
336
  gr.Markdown("""
337
  ---
338
- **Models used:**
339
  [ASR: PlotweaverAI/whisper-small-de-en](https://huggingface.co/PlotweaverAI/whisper-small-de-en) |
340
  [MT: PlotweaverAI/nllb-200-distilled-600M-african-6lang](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) |
341
  [TTS: PlotweaverAI/yoruba-mms-tts-new](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new)
342
  """)
343
 
344
- # Launch
345
  if __name__ == "__main__":
346
  demo.launch()
 
1
  """
2
+ Live Football Commentary Pipeline — Real-Time Streaming
3
+ ========================================================
4
+ English Yoruba with ~3-5 second latency.
5
 
6
+ Uses Gradio's streaming audio API to continuously capture mic input,
7
+ process chunks through ASR → MT → TTS, and play back Yoruba audio.
8
  """
9
 
10
  import torch
11
  import numpy as np
12
  import re
13
  import time
14
+ import io
15
+ import logging
16
  import gradio as gr
17
  from transformers import (
18
  pipeline as hf_pipeline,
 
20
  AutoModelForSeq2SeqLM,
21
  )
22
 
23
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
24
+ logger = logging.getLogger(__name__)
25
+
26
  # =============================================================================
27
  # Configuration
28
  # =============================================================================
 
37
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
38
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
39
 
40
+ # Streaming config
41
+ CHUNK_DURATION_S = 5 # Process every N seconds of audio
42
+ TARGET_SR = 16000 # Whisper expects 16kHz
43
+
44
 
45
  # =============================================================================
46
  # Load models (runs once at startup)
 
49
  print(f"Device: {DEVICE} | Dtype: {TORCH_DTYPE}")
50
  print("Loading models...")
51
 
 
52
  print(f" Loading ASR: {ASR_MODEL_ID}")
53
  asr_pipe = hf_pipeline(
54
  "automatic-speech-recognition",
 
56
  device=DEVICE,
57
  torch_dtype=TORCH_DTYPE,
58
  )
59
+ print(" ASR loaded")
60
 
 
61
  print(f" Loading MT: {MT_MODEL_ID}")
62
  mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID)
63
  mt_model = AutoModelForSeq2SeqLM.from_pretrained(
64
+ MT_MODEL_ID, torch_dtype=TORCH_DTYPE
 
65
  ).to(DEVICE)
66
  mt_tokenizer.src_lang = MT_SRC_LANG
67
+ tgt_lang_id = mt_tokenizer.convert_tokens_to_ids(MT_TGT_LANG)
68
+ print(f" MT loaded (target token id: {tgt_lang_id})")
69
 
 
70
  print(f" Loading TTS: {TTS_MODEL_ID}")
71
  tts_pipe = hf_pipeline(
72
  "text-to-speech",
 
74
  device=DEVICE,
75
  torch_dtype=TORCH_DTYPE,
76
  )
77
+ print(" TTS loaded")
78
  print("All models loaded!")
79
 
80
 
81
  # =============================================================================
82
+ # Pipeline functions
83
  # =============================================================================
84
 
85
  def split_into_sentences(text):
86
+ """Split raw ASR text into individual sentences."""
87
  text = text.strip()
88
  if not text:
89
  return []
 
 
90
  text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip())
 
 
91
  if re.search(r'[.!?]', text):
92
  sentences = re.split(r'(?<=[.!?])\s+', text)
93
  return [s.strip() for s in sentences if s.strip()]
 
 
94
  words = text.split()
95
  MAX_WORDS = 12
96
  sentences = []
 
104
 
105
 
106
def transcribe(audio_array, sample_rate=16000):
    """ASR: English audio to text.

    Args:
        audio_array: 1-D float32 waveform.
        sample_rate: sampling rate of the waveform in Hz.

    Returns:
        The recognized English text, stripped; "" for near-empty input.
    """
    # Skip chunks shorter than ~0.1 s (at 16 kHz) — nothing useful to decode.
    if len(audio_array) < 1600:
        return ""
    asr_result = asr_pipe(
        {"raw": audio_array, "sampling_rate": sample_rate},
        return_timestamps=False,
    )
    return asr_result["text"].strip()
115
 
116
 
117
  def translate_sentence(text, max_length=256):
118
+ """MT: Single sentence English to Yoruba."""
119
  inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
 
 
120
  with torch.no_grad():
121
  output_ids = mt_model.generate(
122
  **inputs,
 
130
  return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
131
 
132
 
133
def translate_text(text):
    """Split *text* into sentences and translate each to Yoruba.

    Returns:
        The per-sentence translations joined with single spaces, or ""
        when the input yields no sentences.
    """
    parts = split_into_sentences(text)
    if not parts:
        return ""
    rendered = []
    for part in parts:
        rendered.append(translate_sentence(part))
    return ' '.join(rendered)
 
140
 
141
 
142
def synthesize(text):
    """TTS: Yoruba text to audio.

    Returns:
        (waveform, sample_rate); an empty float32 array at TARGET_SR
        when *text* is blank.
    """
    if not text.strip():
        return np.array([], dtype=np.float32), TARGET_SR
    tts_out = tts_pipe(text)
    waveform = np.array(tts_out["audio"]).squeeze()
    return waveform, tts_out["sampling_rate"]
150
 
151
 
152
def process_chunk(audio_array, sample_rate):
    """Run the full ASR -> MT -> TTS pipeline on one audio chunk.

    Args:
        audio_array: 1-D float32 waveform for one buffered chunk.
        sample_rate: sampling rate of the chunk in Hz.

    Returns:
        (audio_out, sr_out, english, yoruba, elapsed). audio_out/sr_out are
        None when any stage produced nothing, and elapsed is 0 in that case.
    """
    started = time.time()

    english = transcribe(audio_array, sample_rate)
    if not english:
        # Nothing recognized in this chunk — propagate empty results.
        return None, None, "", "", 0

    yoruba = translate_text(english)
    if not yoruba:
        return None, None, english, "", 0

    audio_out, sr_out = synthesize(yoruba)
    if len(audio_out) == 0:
        return None, None, english, yoruba, 0

    elapsed = time.time() - started
    logger.info(f"Chunk processed in {elapsed:.2f}s: EN='{english[:60]}' -> YO='{yoruba[:60]}'")

    return audio_out, sr_out, english, yoruba, elapsed
175
+
176
+
177
+ # =============================================================================
178
+ # Streaming state management
179
+ # =============================================================================
180
+
181
class StreamState:
    """Manages the audio buffer and transcripts for streaming mode."""

    def __init__(self, chunk_duration_s=CHUNK_DURATION_S):
        # Seconds of audio to accumulate before running the pipeline.
        self.chunk_duration_s = chunk_duration_s
        # Sample rate of the buffered audio; overwritten from the live stream.
        self.buffer_sr = TARGET_SR
        # Delegate field initialization to reset() instead of duplicating
        # every assignment here (the original repeated them in both places).
        self.reset()

    def reset(self):
        """Clear buffered audio, transcripts and counters.

        Leaves chunk_duration_s and buffer_sr untouched, matching the
        original behavior of reset().
        """
        self.audio_buffer = np.array([], dtype=np.float32)
        self.transcript_en = []
        self.transcript_yo = []
        self.chunk_count = 0
        self.total_time = 0.0
199
+
200
+
201
  # =============================================================================
202
  # Gradio interface functions
203
  # =============================================================================
204
 
205
def process_audio_upload(audio_input):
    """Batch mode: upload/record full audio, get translation back.

    Args:
        audio_input: (sample_rate, numpy_array) tuple from Gradio, or None.

    Returns:
        ((sample_rate, waveform), markdown_log) on success,
        otherwise (None, error_message).
    """
    if audio_input is None:
        return None, "Please upload or record audio."

    sample_rate, audio_array = audio_input

    # Mono float32, scaled into [-1, 1] when the source was integer PCM.
    audio_array = audio_array.astype(np.float32)
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)
    if audio_array.max() > 1.0 or audio_array.min() < -1.0:
        audio_array = audio_array / max(abs(audio_array.max()), abs(audio_array.min()))

    total_start = time.time()
    log = []

    # ASR
    t0 = time.time()
    english = transcribe(audio_array, sample_rate)
    log.append(f"**ASR** ({time.time()-t0:.2f}s)\n{english}")

    if not english:
        return None, "ASR returned empty text. Try clearer audio."

    # MT — translated sentence by sentence so each EN/YO pair can be logged.
    t0 = time.time()
    translations = []
    for sent in split_into_sentences(english):
        yo_sent = translate_sentence(sent)
        translations.append(yo_sent)
        log.append(f" EN: {sent}\n YO: {yo_sent}")
    yoruba = ' '.join(translations)
    log.append(f"**MT** ({time.time()-t0:.2f}s)")

    if not yoruba:
        return None, "Translation returned empty."

    # TTS
    t0 = time.time()
    audio_out, sr_out = synthesize(yoruba)
    log.append(f"**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
    log.append(f"\n**Total: {time.time()-total_start:.2f}s**")

    return (sr_out, audio_out), "\n".join(log)
249
 
250
 
251
def process_text_input(text):
    """Text mode: type English, get Yoruba audio.

    Args:
        text: English input text.

    Returns:
        ((sample_rate, waveform), markdown_log) on success,
        otherwise (None, error_message).
    """
    if not text or not text.strip():
        return None, "Please enter some English text."

    t_total = time.time()
    log = []

    # MT — sentence by sentence so each EN/YO pair can be logged.
    t0 = time.time()
    sentences = split_into_sentences(text.strip())
    translations = []
    for s in sentences:
        yo = translate_sentence(s)
        translations.append(yo)
        log.append(f"EN: {s}\nYO: {yo}\n")
    yoruba = ' '.join(translations)
    log.append(f"**MT** ({time.time()-t0:.2f}s)")

    # Guard against an empty translation, mirroring process_audio_upload;
    # previously this fell straight through to TTS with empty text.
    if not yoruba:
        return None, "Translation returned empty."

    # TTS
    t0 = time.time()
    audio_out, sr_out = synthesize(yoruba)
    log.append(f"**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
    log.append(f"\n**Total: {time.time()-t_total:.2f}s**")

    return (sr_out, audio_out), "\n".join(log)
 
 
277
 
278
+
279
def streaming_process(audio_input, state):
    """Streaming mode: buffer mic chunks and process once enough accumulates.

    Called repeatedly by Gradio's streaming API each time a new audio
    chunk arrives from the microphone.

    Args:
        audio_input: (sample_rate, numpy_array) chunk from Gradio, or None.
        state: the per-session StreamState, or None on the first call.

    Returns:
        (playable_audio_or_None, live_log_markdown, state)
    """
    if state is None:
        state = StreamState()

    if audio_input is None:
        return None, format_live_log(state), state

    sample_rate, incoming = audio_input

    # Mono float32, scaled into [-1, 1] when the source was integer PCM.
    incoming = incoming.astype(np.float32)
    if incoming.ndim > 1:
        incoming = incoming.mean(axis=1)
    if incoming.max() > 1.0 or incoming.min() < -1.0:
        peak = max(abs(incoming.max()), abs(incoming.min()))
        if peak > 0:
            incoming = incoming / peak

    # Accumulate into the rolling buffer.
    state.buffer_sr = sample_rate
    state.audio_buffer = np.concatenate([state.audio_buffer, incoming])

    needed = int(state.chunk_duration_s * sample_rate)

    if len(state.audio_buffer) < needed:
        # Still filling — report buffering progress only.
        seconds_buffered = len(state.audio_buffer) / sample_rate
        return None, format_live_log(state, seconds_buffered), state

    # Pop one full chunk off the front of the buffer and run the pipeline.
    chunk = state.audio_buffer[:needed]
    state.audio_buffer = state.audio_buffer[needed:]

    audio_out, sr_out, english, yoruba, elapsed = process_chunk(chunk, sample_rate)

    if english:
        state.chunk_count += 1
        state.total_time += elapsed
        state.transcript_en.append(english)
        state.transcript_yo.append(yoruba)

    playable = audio_out is not None and len(audio_out) > 0
    if playable:
        return (sr_out, audio_out), format_live_log(state), state
    return None, format_live_log(state), state
329
+
330
+
331
def format_live_log(state, buffered_s=None):
    """Render the live transcript panel as markdown.

    Args:
        state: the StreamState being reported.
        buffered_s: seconds currently buffered (shown while the buffer is
            filling), or None to omit the buffering line.
    """
    out = [f"**Chunks processed:** {state.chunk_count}"]
    if state.chunk_count > 0:
        avg = state.total_time / state.chunk_count
        out.append(f"**Avg processing time:** {avg:.2f}s per chunk")
    if buffered_s is not None:
        out.append(f"**Buffering:** {buffered_s:.1f}s / {CHUNK_DURATION_S}s")
    out.extend(["", "---", "**Live transcript:**\n"])

    # Only the 10 most recent chunk pairs are shown.
    first = max(0, len(state.transcript_en) - 10)
    for idx in range(first, len(state.transcript_en)):
        out.append(f"**[{idx+1}]** EN: {state.transcript_en[idx]}")
        out.append(f" YO: {state.transcript_yo[idx]}\n")

    return "\n".join(out)
350
+
351
+
352
+ def clear_stream_state():
353
+ """Reset the streaming state."""
354
+ return None, "Stream cleared. Click Start to begin.", StreamState()
355
 
356
 
357
  # =============================================================================
 
359
  # =============================================================================
360
 
361
  DESCRIPTION = """
362
+ # Live Football Commentary \u2014 English \u2192 Yoruba
363
 
364
  Translate English football commentary into Yoruba speech in real-time.
365
 
366
+ **Pipeline:** ASR (Whisper) \u2192 MT (NLLB-200) \u2192 TTS (MMS-TTS Yoruba)
 
 
367
  """
368
 
369
+ STREAMING_INSTRUCTIONS = """
370
+ ### How to use live streaming:
371
+ 1. Click the **microphone** button to start recording
372
+ 2. Speak English commentary naturally
373
+ 3. Every **{chunk_dur}s**, the pipeline processes your audio and plays back Yoruba
374
+ 4. The transcript updates live below
375
+ 5. Click **Clear** to reset
376
+
377
+ **Expected latency:** ~3\u20135 seconds behind your speech.
378
+ """.format(chunk_dur=CHUNK_DURATION_S)
379
+
380
  EXAMPLES_TEXT = [
381
  "And it's a brilliant goal from the striker!",
382
  "The referee has shown a yellow card. Corner kick for the home team.",
 
385
  ]
386
 
387
  with gr.Blocks(
388
+ title="Football Commentary EN\u2192YO",
389
  theme=gr.themes.Soft(),
390
  ) as demo:
391
 
 
393
 
394
  with gr.Tabs():
395
 
396
+ # ---- Tab 1: LIVE STREAMING ----
397
+ with gr.TabItem("Live Streaming"):
398
+ gr.Markdown(STREAMING_INSTRUCTIONS)
399
+
400
+ stream_state = gr.State(StreamState())
401
+
402
+ with gr.Row():
403
+ with gr.Column():
404
+ stream_input = gr.Audio(
405
+ label="Microphone (streaming)",
406
+ type="numpy",
407
+ sources=["microphone"],
408
+ streaming=True,
409
+ )
410
+ clear_btn = gr.Button("Clear & Reset", variant="secondary")
411
+
412
+ with gr.Column():
413
+ stream_output = gr.Audio(
414
+ label="Yoruba Output",
415
+ type="numpy",
416
+ autoplay=True,
417
+ )
418
+ stream_log = gr.Markdown(
419
+ label="Live Transcript",
420
+ value="Waiting for audio input..."
421
+ )
422
+
423
+ stream_input.stream(
424
+ fn=streaming_process,
425
+ inputs=[stream_input, stream_state],
426
+ outputs=[stream_output, stream_log, stream_state],
427
+ )
428
+
429
+ clear_btn.click(
430
+ fn=clear_stream_state,
431
+ outputs=[stream_output, stream_log, stream_state],
432
+ )
433
+
434
+ # ---- Tab 2: Upload/Record (Batch) ----
435
+ with gr.TabItem("Upload / Record (Batch)"):
436
+ gr.Markdown("Upload or record English commentary. Full pipeline processes after recording.")
437
 
438
  with gr.Row():
439
  with gr.Column():
 
442
  type="numpy",
443
  sources=["upload", "microphone"],
444
  )
445
+ audio_submit = gr.Button("Translate to Yoruba", variant="primary", size="lg")
446
 
447
  with gr.Column():
448
  audio_output = gr.Audio(label="Yoruba Commentary Audio", type="numpy")
449
  audio_log = gr.Markdown(label="Pipeline Log")
450
 
451
+ audio_submit.click(
452
+ fn=process_audio_upload,
453
  inputs=[audio_input],
454
  outputs=[audio_output, audio_log],
455
  )
456
 
457
+ # ---- Tab 3: Text Input ----
458
+ with gr.TabItem("Text \u2192 Audio"):
459
+ gr.Markdown("Type English text to translate to Yoruba and hear the result.")
460
 
461
  with gr.Row():
462
  with gr.Column():
 
465
  placeholder="Type English football commentary here...",
466
  lines=4,
467
  )
468
+ text_submit = gr.Button("Translate to Yoruba", variant="primary", size="lg")
 
469
  gr.Examples(
470
  examples=[[e] for e in EXAMPLES_TEXT],
471
  inputs=[text_input],
 
476
  text_audio_output = gr.Audio(label="Yoruba Audio", type="numpy")
477
  text_log = gr.Markdown(label="Pipeline Log")
478
 
479
+ text_submit.click(
480
+ fn=process_text_input,
481
  inputs=[text_input],
482
  outputs=[text_audio_output, text_log],
483
  )
484
 
485
  gr.Markdown("""
486
  ---
487
+ **Models:**
488
  [ASR: PlotweaverAI/whisper-small-de-en](https://huggingface.co/PlotweaverAI/whisper-small-de-en) |
489
  [MT: PlotweaverAI/nllb-200-distilled-600M-african-6lang](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) |
490
  [TTS: PlotweaverAI/yoruba-mms-tts-new](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new)
491
  """)
492
 
 
493
  if __name__ == "__main__":
494
  demo.launch()