update for live streaming

README.md CHANGED

@@ -1,6 +1,6 @@
 ---
 title: Live Football Commentary - English to Yoruba
-emoji: …
+emoji: 🏟️
 colorFrom: green
 colorTo: yellow
 sdk: gradio

@@ -19,8 +19,7 @@ tags:
 - yoruba
 - football
 - commentary
-- …
-- …
-
-short_description: Translate live English football commentary to Yoruba speech
+- streaming
+- real-time
+short_description: Real-time English football commentary to Yoruba speech
 ---

app.py CHANGED

@@ -1,15 +1,18 @@
 """
-Live Football Commentary Pipeline — …
-=====================================================
-…
+Live Football Commentary Pipeline — Real-Time Streaming
+========================================================
+English → Yoruba with ~3-5 second latency.
 
-…
+Uses Gradio's streaming audio API to continuously capture mic input,
+process chunks through ASR → MT → TTS, and play back Yoruba audio.
 """
 
 import torch
 import numpy as np
 import re
 import time
+import io
+import logging
 import gradio as gr
 from transformers import (
     pipeline as hf_pipeline,
@@ -17,6 +20,9 @@ from transformers import (
     AutoModelForSeq2SeqLM,
 )
 
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger(__name__)
+
 # =============================================================================
 # Configuration
 # =============================================================================
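
The new logging setup gives the streaming path timestamped diagnostics instead of bare `print` calls. A quick self-contained sketch of what the configured format emits (the call site here is illustrative, not from the commit):

```python
import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# Prints something like: 2024-05-01 12:00:00,123 [INFO] Chunk processed in 1.42s
logger.info("Chunk processed in %.2fs", 1.42)
```
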
@@ -31,6 +37,10 @@ MT_TGT_LANG = "yor_Latn"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
 
+# Streaming config
+CHUNK_DURATION_S = 5  # Process every N seconds of audio
+TARGET_SR = 16000     # Whisper expects 16kHz
+
 
 # =============================================================================
 # Load models (runs once at startup)
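
For sizing intuition behind these two constants: every processed chunk is `CHUNK_DURATION_S * TARGET_SR` samples. A small sketch of the arithmetic, matching the float32 buffers the streaming code uses later in this diff:

```python
CHUNK_DURATION_S = 5
TARGET_SR = 16000

required_samples = CHUNK_DURATION_S * TARGET_SR   # 80000 samples per processed chunk
buffer_kib = required_samples * 4 / 1024          # float32 buffer: 312.5 KiB
print(required_samples, buffer_kib)
```
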
@@ -39,7 +49,6 @@ TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
 print(f"Device: {DEVICE} | Dtype: {TORCH_DTYPE}")
 print("Loading models...")
 
-# ASR
 print(f" Loading ASR: {ASR_MODEL_ID}")
 asr_pipe = hf_pipeline(
     "automatic-speech-recognition",
@@ -47,19 +56,17 @@ asr_pipe = hf_pipeline(
     device=DEVICE,
     torch_dtype=TORCH_DTYPE,
 )
-print(" ASR loaded …
+print(" ASR loaded")
 
-# MT
 print(f" Loading MT: {MT_MODEL_ID}")
 mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID)
 mt_model = AutoModelForSeq2SeqLM.from_pretrained(
-    MT_MODEL_ID,
-    torch_dtype=TORCH_DTYPE,
+    MT_MODEL_ID, torch_dtype=TORCH_DTYPE
 ).to(DEVICE)
 mt_tokenizer.src_lang = MT_SRC_LANG
-
+tgt_lang_id = mt_tokenizer.convert_tokens_to_ids(MT_TGT_LANG)
+print(f" MT loaded (target token id: {tgt_lang_id})")
 
-# TTS
 print(f" Loading TTS: {TTS_MODEL_ID}")
 tts_pipe = hf_pipeline(
     "text-to-speech",
@@ -67,29 +74,23 @@ tts_pipe = hf_pipeline(
     device=DEVICE,
     torch_dtype=TORCH_DTYPE,
 )
-print(" TTS loaded …
+print(" TTS loaded")
 print("All models loaded!")
 
 
 # =============================================================================
-# Pipeline functions
+# Pipeline functions
 # =============================================================================
 
 def split_into_sentences(text):
-    """Split raw ASR text into individual sentences …
+    """Split raw ASR text into individual sentences."""
     text = text.strip()
     if not text:
         return []
-
-    # Normalize case
     text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip())
-
-    # If text has punctuation, split on it
     if re.search(r'[.!?]', text):
         sentences = re.split(r'(?<=[.!?])\s+', text)
         return [s.strip() for s in sentences if s.strip()]
-
-    # No punctuation — split into ~12 word chunks
     words = text.split()
     MAX_WORDS = 12
     sentences = []
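
To see both branches of `split_into_sentences` in action (the punctuation split and the ~12-word fallback), here is a hedged standalone approximation; the fallback loop body falls between hunks and is not shown, so the chunking below is a plausible reconstruction rather than the exact code:

```python
import re

def split_into_sentences_demo(text, max_words=12):
    text = text.strip()
    if not text:
        return []
    if re.search(r'[.!?]', text):
        return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    words = text.split()  # no punctuation: fall back to fixed-size word chunks
    return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

print(split_into_sentences_demo("Goal! What a strike."))
# -> ['Goal!', 'What a strike.']
print(split_into_sentences_demo("the keeper rolls it out to the fullback who carries it forward into space"))
# -> two chunks of at most 12 words each
```
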
@@ -103,21 +104,19 @@ def split_into_sentences(text):
 
 
 def transcribe(audio_array, sample_rate=16000):
-    """ASR: English audio …
+    """ASR: English audio to text."""
+    if len(audio_array) < 1600:  # Less than 0.1s
+        return ""
     result = asr_pipe(
         {"raw": audio_array, "sampling_rate": sample_rate},
-        chunk_length_s=15,
-        batch_size=1,
         return_timestamps=False,
     )
     return result["text"].strip()
 
 
 def translate_sentence(text, max_length=256):
-    """MT: …
+    """MT: Single sentence English to Yoruba."""
     inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
-    tgt_lang_id = mt_tokenizer.convert_tokens_to_ids(MT_TGT_LANG)
-
     with torch.no_grad():
         output_ids = mt_model.generate(
             **inputs,
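
The `generate(...)` keyword arguments fall between hunks and are not shown, but the new module-level `tgt_lang_id` (added in the model-loading hunk above) points at the standard NLLB pattern: the target language is chosen by forcing the first decoded token. A self-contained sketch of that pattern, using the public base checkpoint as a stand-in for the Space's fine-tuned model; treat the exact kwargs as an assumption about the elided code:

```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Stand-in checkpoint; the Space loads its fine-tuned MT_MODEL_ID instead.
name = "facebook/nllb-200-distilled-600M"
tok = AutoTokenizer.from_pretrained(name, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(name)

batch = tok("A great save by the keeper!", return_tensors="pt", truncation=True)
with torch.no_grad():
    out = model.generate(
        **batch,
        forced_bos_token_id=tok.convert_tokens_to_ids("yor_Latn"),  # force Yoruba output
        max_length=256,
    )
print(tok.decode(out[0], skip_special_tokens=True))
```
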
@@ -131,124 +130,228 @@ def translate_sentence(text, max_length=256):
     return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
 
-def translate_long_text(…):
-    """Split …
+def translate_text(text):
+    """Split and translate sentence by sentence."""
     sentences = split_into_sentences(text)
-    …
-    …
-    …
-    …
-    return ' '.join(translations), sentences, translations
+    if not sentences:
+        return ""
+    translations = [translate_sentence(s) for s in sentences]
+    return ' '.join(translations)
 
 
 def synthesize(text):
-    """TTS: Yoruba text …
+    """TTS: Yoruba text to audio."""
+    if not text.strip():
+        return np.array([], dtype=np.float32), TARGET_SR
     result = tts_pipe(text)
     audio = np.array(result["audio"]).squeeze()
     sr = result["sampling_rate"]
     return audio, sr
 
 
+def process_chunk(audio_array, sample_rate):
+    """Full pipeline on a single audio chunk."""
+    t_start = time.time()
+
+    # ASR
+    english = transcribe(audio_array, sample_rate)
+    if not english:
+        return None, None, "", "", 0
+
+    # MT
+    yoruba = translate_text(english)
+    if not yoruba:
+        return None, None, english, "", 0
+
+    # TTS
+    audio_out, sr_out = synthesize(yoruba)
+    if len(audio_out) == 0:
+        return None, None, english, yoruba, 0
+
+    elapsed = time.time() - t_start
+    logger.info(f"Chunk processed in {elapsed:.2f}s: EN='{english[:60]}' -> YO='{yoruba[:60]}'")
+
+    return audio_out, sr_out, english, yoruba, elapsed
+
+
+# =============================================================================
+# Streaming state management
+# =============================================================================
+
+class StreamState:
+    """Manages the audio buffer for streaming mode."""
+
+    def __init__(self, chunk_duration_s=CHUNK_DURATION_S):
+        self.chunk_duration_s = chunk_duration_s
+        self.audio_buffer = np.array([], dtype=np.float32)
+        self.buffer_sr = TARGET_SR
+        self.transcript_en = []
+        self.transcript_yo = []
+        self.chunk_count = 0
+        self.total_time = 0.0
+
+    def reset(self):
+        self.audio_buffer = np.array([], dtype=np.float32)
+        self.transcript_en = []
+        self.transcript_yo = []
+        self.chunk_count = 0
+        self.total_time = 0.0
+
+
 # =============================================================================
 # Gradio interface functions
 # =============================================================================
 
-def …
-    """
-    Full pipeline: English audio → Yoruba audio.
-    audio_input: tuple of (sample_rate, numpy_array) from Gradio.
-    """
+def process_audio_upload(audio_input):
+    """Batch mode: upload/record full audio, get translation back."""
     if audio_input is None:
-        return None, "…
+        return None, "Please upload or record audio."
 
     sample_rate, audio_array = audio_input
-
-    # Convert to float32 mono if needed
     audio_array = audio_array.astype(np.float32)
     if audio_array.ndim > 1:
         audio_array = audio_array.mean(axis=1)
-
-    # Normalize to [-1, 1] if integer audio
     if audio_array.max() > 1.0 or audio_array.min() < -1.0:
         audio_array = audio_array / max(abs(audio_array.max()), abs(audio_array.min()))
 
     total_start = time.time()
-    log_lines = []
+    log = []
 
-    # Step 1: ASR
+    # ASR
     t0 = time.time()
-    english_text = transcribe(audio_array, sample_rate)
-    asr_time = time.time() - t0
-    log_lines.append(f"**🎤 ASR** ({asr_time:.2f}s)")
-    log_lines.append(f"English: {english_text}")
-    log_lines.append("")
+    english = transcribe(audio_array, sample_rate)
+    log.append(f"**ASR** ({time.time()-t0:.2f}s)\n{english}")
 
-    if not english_text:
-        return None, "…
+    if not english:
+        return None, "ASR returned empty text. Try clearer audio."
 
-    # Step 2: MT
-    t0 = time.time()
-    yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text)
-    mt_time = time.time() - t0
-    log_lines.append(f"**🔄 Translation** ({mt_time:.2f}s)")
-    for en_s, yo_s in zip(en_sentences, yo_sentences):
-        log_lines.append(f"  EN: {en_s}")
-        log_lines.append(f"  YO: {yo_s}")
-    log_lines.append("")
-
-    if not yoruba_text:
-        return None, "⚠️ Translation returned empty text."
-
-    # Step 3: TTS
+    # MT
     t0 = time.time()
-    …
-    …
-    …
+    sentences = split_into_sentences(english)
+    translations = []
+    for s in sentences:
+        yo = translate_sentence(s)
+        translations.append(yo)
+        log.append(f" EN: {s}\n YO: {yo}")
+    yoruba = ' '.join(translations)
+    log.append(f"**MT** ({time.time()-t0:.2f}s)")
 
-    …
-    …
-    log_lines.append(f"**Total: {total:.2f}s**")
+    if not yoruba:
+        return None, "Translation returned empty."
 
-    …
+    # TTS
+    t0 = time.time()
+    audio_out, sr_out = synthesize(yoruba)
+    log.append(f"**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
+    log.append(f"\n**Total: {time.time()-total_start:.2f}s**")
 
-    return ( …
+    return (sr_out, audio_out), "\n".join(log)
 
 
-def process_text(english_text):
-    """
-    …
-    …
-    """
-    if not english_text or not english_text.strip():
-        return None, "⚠️ Please enter some English text."
+def process_text_input(text):
+    """Text mode: type English, get Yoruba audio."""
+    if not text or not text.strip():
+        return None, "Please enter some English text."
 
-    …
-    …
+    t_total = time.time()
+    log = []
 
     # MT
     t0 = time.time()
-    …
-    …
-    …
-    …
-    …
-    …
-    if not yoruba_text:
-        return None, "⚠️ Translation returned empty text."
+    sentences = split_into_sentences(text.strip())
+    translations = []
+    for s in sentences:
+        yo = translate_sentence(s)
+        translations.append(yo)
+        log.append(f"EN: {s}\nYO: {yo}\n")
+    yoruba = ' '.join(translations)
+    log.append(f"**MT** ({time.time()-t0:.2f}s)")
 
     # TTS
     t0 = time.time()
-    …
-    …
-    …
+    audio_out, sr_out = synthesize(yoruba)
+    log.append(f"**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
 
-    …
-    log_lines.append("")
-    log_lines.append(f"**Total: {total:.2f}s**")
+    log.append(f"\n**Total: {time.time()-t_total:.2f}s**")
 
-    …
+    return (sr_out, audio_out), "\n".join(log)
+
+
+def streaming_process(audio_input, state):
+    """
+    Streaming mode: receives audio chunks from the microphone,
+    buffers them, and processes when enough has accumulated.
+
+    This function is called repeatedly by Gradio's streaming API
+    each time a new audio chunk arrives from the mic.
+    """
+    if state is None:
+        state = StreamState()
+
+    if audio_input is None:
+        return None, format_live_log(state), state
+
+    sample_rate, audio_chunk = audio_input
+    audio_chunk = audio_chunk.astype(np.float32)
+    if audio_chunk.ndim > 1:
+        audio_chunk = audio_chunk.mean(axis=1)
+    if audio_chunk.max() > 1.0 or audio_chunk.min() < -1.0:
+        max_val = max(abs(audio_chunk.max()), abs(audio_chunk.min()))
+        if max_val > 0:
+            audio_chunk = audio_chunk / max_val
+
+    # Add to buffer
+    state.buffer_sr = sample_rate
+    state.audio_buffer = np.concatenate([state.audio_buffer, audio_chunk])
+
+    required_samples = int(state.chunk_duration_s * sample_rate)
+
+    # Not enough audio yet
+    if len(state.audio_buffer) < required_samples:
+        buffered_s = len(state.audio_buffer) / sample_rate
+        return None, format_live_log(state, buffered_s), state
+
+    # Extract chunk and process
+    chunk = state.audio_buffer[:required_samples]
+    state.audio_buffer = state.audio_buffer[required_samples:]
+
+    audio_out, sr_out, english, yoruba, elapsed = process_chunk(chunk, sample_rate)
+
+    if english:
+        state.chunk_count += 1
+        state.total_time += elapsed
+        state.transcript_en.append(english)
+        state.transcript_yo.append(yoruba)
+
+    if audio_out is not None and len(audio_out) > 0:
+        return (sr_out, audio_out), format_live_log(state), state
+    else:
+        return None, format_live_log(state), state
+
+
+def format_live_log(state, buffered_s=None):
+    """Format the live transcript log."""
+    lines = [f"**Chunks processed:** {state.chunk_count}"]
+    if state.chunk_count > 0:
+        avg = state.total_time / state.chunk_count
+        lines.append(f"**Avg processing time:** {avg:.2f}s per chunk")
+    if buffered_s is not None:
+        lines.append(f"**Buffering:** {buffered_s:.1f}s / {CHUNK_DURATION_S}s")
+    lines.append("")
+    lines.append("---")
+    lines.append("**Live transcript:**\n")
+
+    # Show last 10 chunks
+    start = max(0, len(state.transcript_en) - 10)
+    for i in range(start, len(state.transcript_en)):
+        lines.append(f"**[{i+1}]** EN: {state.transcript_en[i]}")
+        lines.append(f" YO: {state.transcript_yo[i]}\n")
+
+    return "\n".join(lines)
+
+
+def clear_stream_state():
+    """Reset the streaming state."""
+    return None, "Stream cleared. Click Start to begin.", StreamState()
 
 
 # =============================================================================
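
The accumulate-and-drain discipline in `streaming_process` can be exercised without a microphone or the models. A minimal offline simulation of just the buffer logic (the pipeline call is replaced by a print; the per-callback chunk size is arbitrary):

```python
import numpy as np

CHUNK_DURATION_S = 5
SR = 16000

def feed(buffer, mic_chunk):
    """Accumulate mic chunks; drain one fixed-size chunk once enough is buffered."""
    buffer = np.concatenate([buffer, mic_chunk])
    required = int(CHUNK_DURATION_S * SR)
    if len(buffer) < required:
        return buffer, None                       # still buffering
    return buffer[required:], buffer[:required]   # leftover, ready-to-process chunk

buffer = np.array([], dtype=np.float32)
for i in range(12):  # simulate ~0.5 s of audio per streaming callback
    buffer, ready = feed(buffer, np.zeros(SR // 2, dtype=np.float32))
    if ready is not None:
        print(f"callback {i}: would run ASR -> MT -> TTS on {len(ready)/SR:.1f}s of audio")
```
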
@@ -256,15 +359,24 @@ def process_text(english_text):
 # =============================================================================
 
 DESCRIPTION = """
-# …
+# Live Football Commentary — English → Yoruba
 
 Translate English football commentary into Yoruba speech in real-time.
 
-**Pipeline:** ASR (Whisper) …
-
-Upload or record English commentary audio, and get back Yoruba audio + full transcript.
+**Pipeline:** ASR (Whisper) → MT (NLLB-200) → TTS (MMS-TTS Yoruba)
 """
 
+STREAMING_INSTRUCTIONS = """
+### How to use live streaming:
+1. Click the **microphone** button to start recording
+2. Speak English commentary naturally
+3. Every **{chunk_dur}s**, the pipeline processes your audio and plays back Yoruba
+4. The transcript updates live below
+5. Click **Clear** to reset
+
+**Expected latency:** ~3–5 seconds behind your speech.
+""".format(chunk_dur=CHUNK_DURATION_S)
+
 EXAMPLES_TEXT = [
     "And it's a brilliant goal from the striker!",
     "The referee has shown a yellow card. Corner kick for the home team.",
@@ -273,7 +385,7 @@ EXAMPLES_TEXT = [
 ]
 
 with gr.Blocks(
-    title="Football Commentary EN …
+    title="Football Commentary EN→YO",
     theme=gr.themes.Soft(),
 ) as demo:
 
@@ -281,9 +393,47 @@
 
     with gr.Tabs():
 
-        # ---- Tab 1: …
-        with gr.TabItem(" …
-            gr.Markdown( …
+        # ---- Tab 1: LIVE STREAMING ----
+        with gr.TabItem("Live Streaming"):
+            gr.Markdown(STREAMING_INSTRUCTIONS)
+
+            stream_state = gr.State(StreamState())
+
+            with gr.Row():
+                with gr.Column():
+                    stream_input = gr.Audio(
+                        label="Microphone (streaming)",
+                        type="numpy",
+                        sources=["microphone"],
+                        streaming=True,
+                    )
+                    clear_btn = gr.Button("Clear & Reset", variant="secondary")
+
+                with gr.Column():
+                    stream_output = gr.Audio(
+                        label="Yoruba Output",
+                        type="numpy",
+                        autoplay=True,
+                    )
+                    stream_log = gr.Markdown(
+                        label="Live Transcript",
+                        value="Waiting for audio input..."
+                    )
+
+            stream_input.stream(
+                fn=streaming_process,
+                inputs=[stream_input, stream_state],
+                outputs=[stream_output, stream_log, stream_state],
+            )
+
+            clear_btn.click(
+                fn=clear_stream_state,
+                outputs=[stream_output, stream_log, stream_state],
+            )
+
+        # ---- Tab 2: Upload/Record (Batch) ----
+        with gr.TabItem("Upload / Record (Batch)"):
+            gr.Markdown("Upload or record English commentary. Full pipeline processes after recording.")
 
             with gr.Row():
                 with gr.Column():
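
Stripped of the app specifics, the event wiring this tab relies on is Gradio's `.stream()` event on a microphone `gr.Audio` with `streaming=True`. A minimal runnable sketch, with an echo handler standing in for the buffered ASR → MT → TTS chain:

```python
import gradio as gr

def echo(chunk, state):
    # Placeholder for streaming_process: the real handler buffers and translates.
    return chunk, state

with gr.Blocks() as demo:
    state = gr.State(None)
    mic = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    out = gr.Audio(type="numpy", autoplay=True)
    mic.stream(fn=echo, inputs=[mic, state], outputs=[out, state])

if __name__ == "__main__":
    demo.launch()
```
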
@@ -292,21 +442,21 @@
                         type="numpy",
                         sources=["upload", "microphone"],
                     )
-                    …
+                    audio_submit = gr.Button("Translate to Yoruba", variant="primary", size="lg")
 
                 with gr.Column():
                     audio_output = gr.Audio(label="Yoruba Commentary Audio", type="numpy")
                     audio_log = gr.Markdown(label="Pipeline Log")
 
-            …
-                fn= …
+            audio_submit.click(
+                fn=process_audio_upload,
                 inputs=[audio_input],
                 outputs=[audio_output, audio_log],
             )
 
-        # ---- Tab …
-        with gr.TabItem(" …
-            gr.Markdown("Type …
+        # ---- Tab 3: Text Input ----
+        with gr.TabItem("Text → Audio"):
+            gr.Markdown("Type English text to translate to Yoruba and hear the result.")
 
             with gr.Row():
                 with gr.Column():
@@ -315,8 +465,7 @@
                         placeholder="Type English football commentary here...",
                         lines=4,
                     )
-                    …
-                    …
+                    text_submit = gr.Button("Translate to Yoruba", variant="primary", size="lg")
                     gr.Examples(
                         examples=[[e] for e in EXAMPLES_TEXT],
                         inputs=[text_input],
@@ -327,20 +476,19 @@
                     text_audio_output = gr.Audio(label="Yoruba Audio", type="numpy")
                     text_log = gr.Markdown(label="Pipeline Log")
 
-            …
-                fn= …
+            text_submit.click(
+                fn=process_text_input,
                 inputs=[text_input],
                 outputs=[text_audio_output, text_log],
             )
 
     gr.Markdown("""
 ---
-**Models …
+**Models:**
 [ASR: PlotweaverAI/whisper-small-de-en](https://huggingface.co/PlotweaverAI/whisper-small-de-en) |
 [MT: PlotweaverAI/nllb-200-distilled-600M-african-6lang](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) |
 [TTS: PlotweaverAI/yoruba-mms-tts-new](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new)
 """)
 
-# Launch
 if __name__ == "__main__":
     demo.launch()
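
One deployment note, not part of this commit: streaming Spaces often enable Gradio's request queue so overlapping chunk callbacks are serialized rather than racing the models. A hedged variant of the launch block:

```python
if __name__ == "__main__":
    demo.queue().launch()
```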