Spaces:

PlotweaverModel
/

Live-Football-Commentary

Running

App Files Community

PlotweaverModel committed on 12 days ago

Commit

fdd3dce

verified ·

1 Parent(s): 79f78ef

Upload app.py

Browse files

Files changed (1) hide show

app.py +50 -9

app.py CHANGED Viewed

@@ -533,29 +533,70 @@ def dub_video(video_path, progress=gr.Progress()):
         if not english_text:
             return None, "ASR returned empty text. The video may have no audible speech."
-        # Step 3: Translate (using beam search for best quality since this is batch)
-        progress(0.5, desc="Translating English to Yoruba...")
         t0 = time.time()
         sentences = split_into_sentences(english_text)
         translations = []
-        for s in sentences:
-            yo = translate_sentence(s, fast=False)  # beam search for quality
             translations.append(yo)
         yoruba_text = ' '.join(translations)
-        log_lines.append(f"\n**MT** ({time.time()-t0:.2f}s, {len(sentences)} sentences)")
         log_lines.append(f"{yoruba_text[:300]}{'...' if len(yoruba_text) > 300 else ''}")
         if not yoruba_text:
             return None, "Translation returned empty text."
-        # Step 4: TTS
-        progress(0.7, desc="Synthesizing Yoruba speech...")
         t0 = time.time()
-        yoruba_audio, output_sr = synthesize(yoruba_text)
         sf.write(yoruba_audio_raw, yoruba_audio, output_sr)
         yoruba_duration = len(yoruba_audio) / output_sr
         log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
-        log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio")
         # Step 5: Time-align Yoruba audio to match video duration
         progress(0.85, desc="Aligning audio to video duration...")

         if not english_text:
             return None, "ASR returned empty text. The video may have no audible speech."
+        # Step 3: Translate (fast greedy decoding for speed - still good quality)
+        progress(0.3, desc="Translating English to Yoruba...")
         t0 = time.time()
         sentences = split_into_sentences(english_text)
+        n_sentences = len(sentences)
+        log_lines.append(f"\n**MT** starting ({n_sentences} sentences)")
         translations = []
+        for i, s in enumerate(sentences):
+            # Fast mode (greedy) is 3-4x faster than beam search
+            # Still produces good quality for most sentences
+            yo = translate_sentence(s, fast=True)
             translations.append(yo)
+            # Update progress per sentence
+            mt_progress = 0.3 + (0.35 * (i + 1) / n_sentences)
+            progress(mt_progress, desc=f"Translating {i+1}/{n_sentences}...")
         yoruba_text = ' '.join(translations)
+        log_lines.append(f"**MT** completed in {time.time()-t0:.2f}s")
         log_lines.append(f"{yoruba_text[:300]}{'...' if len(yoruba_text) > 300 else ''}")
         if not yoruba_text:
             return None, "Translation returned empty text."
+        # Step 4: TTS - chunk long text into sentence groups to avoid hanging
+        progress(0.65, desc="Synthesizing Yoruba speech...")
         t0 = time.time()
+        # Split Yoruba text into chunks of SENTENCES_PER_TTS_CHUNK sentences each for faster TTS
+        yoruba_sentences = re.split(r'(?<=[.!?])\s+', yoruba_text)
+        yoruba_sentences = [s.strip() for s in yoruba_sentences if s.strip()]
+        n_yo = len(yoruba_sentences)
+        SENTENCES_PER_TTS_CHUNK = 2
+        audio_segments = []
+        output_sr = None
+        for i in range(0, n_yo, SENTENCES_PER_TTS_CHUNK):
+            chunk_sents = yoruba_sentences[i:i + SENTENCES_PER_TTS_CHUNK]
+            chunk_text = ' '.join(chunk_sents)
+            if not chunk_text:
+                continue
+            audio_seg, seg_sr = synthesize(chunk_text)
+            if output_sr is None:
+                output_sr = seg_sr
+            if len(audio_seg) > 0:
+                audio_segments.append(audio_seg)
+                # Add small silence between chunks (200ms)
+                silence = np.zeros(int(0.2 * seg_sr), dtype=np.float32)
+                audio_segments.append(silence)
+            # Update progress per TTS chunk
+            tts_progress = 0.65 + (0.2 * (i + SENTENCES_PER_TTS_CHUNK) / n_yo)
+            progress(min(tts_progress, 0.85), desc=f"Synthesizing audio {min(i+SENTENCES_PER_TTS_CHUNK, n_yo)}/{n_yo}...")
+        if not audio_segments:
+            return None, "TTS produced no audio."
+        yoruba_audio = np.concatenate(audio_segments)
         sf.write(yoruba_audio_raw, yoruba_audio, output_sr)
         yoruba_duration = len(yoruba_audio) / output_sr
         log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
+        log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
         # Step 5: Time-align Yoruba audio to match video duration
         progress(0.85, desc="Aligning audio to video duration...")