Upload app.py
Browse files
app.py
CHANGED
|
@@ -533,29 +533,70 @@ def dub_video(video_path, progress=gr.Progress()):
|
|
| 533 |
if not english_text:
|
| 534 |
return None, "ASR returned empty text. The video may have no audible speech."
|
| 535 |
|
| 536 |
-
# Step 3: Translate (
|
| 537 |
-
progress(0.
|
| 538 |
t0 = time.time()
|
| 539 |
sentences = split_into_sentences(english_text)
|
|
|
|
|
|
|
|
|
|
| 540 |
translations = []
|
| 541 |
-
for s in sentences:
|
| 542 |
-
|
|
|
|
|
|
|
| 543 |
translations.append(yo)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
yoruba_text = ' '.join(translations)
|
| 545 |
-
log_lines.append(f"
|
| 546 |
log_lines.append(f"{yoruba_text[:300]}{'...' if len(yoruba_text) > 300 else ''}")
|
| 547 |
|
| 548 |
if not yoruba_text:
|
| 549 |
return None, "Translation returned empty text."
|
| 550 |
|
| 551 |
-
# Step 4: TTS
|
| 552 |
-
progress(0.
|
| 553 |
t0 = time.time()
|
| 554 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 555 |
sf.write(yoruba_audio_raw, yoruba_audio, output_sr)
|
| 556 |
yoruba_duration = len(yoruba_audio) / output_sr
|
| 557 |
log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
|
| 558 |
-
log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio")
|
| 559 |
|
| 560 |
# Step 5: Time-align Yoruba audio to match video duration
|
| 561 |
progress(0.85, desc="Aligning audio to video duration...")
|
|
|
|
| 533 |
if not english_text:
|
| 534 |
return None, "ASR returned empty text. The video may have no audible speech."
|
| 535 |
|
| 536 |
+
# Step 3: Translate (fast greedy decoding for speed - still good quality)
|
| 537 |
+
progress(0.3, desc="Translating English to Yoruba...")
|
| 538 |
t0 = time.time()
|
| 539 |
sentences = split_into_sentences(english_text)
|
| 540 |
+
n_sentences = len(sentences)
|
| 541 |
+
log_lines.append(f"\n**MT** starting ({n_sentences} sentences)")
|
| 542 |
+
|
| 543 |
translations = []
|
| 544 |
+
for i, s in enumerate(sentences):
|
| 545 |
+
# Fast mode (greedy) is 3-4x faster than beam search
|
| 546 |
+
# Still produces good quality for most sentences
|
| 547 |
+
yo = translate_sentence(s, fast=True)
|
| 548 |
translations.append(yo)
|
| 549 |
+
# Update progress per sentence
|
| 550 |
+
mt_progress = 0.3 + (0.35 * (i + 1) / n_sentences)
|
| 551 |
+
progress(mt_progress, desc=f"Translating {i+1}/{n_sentences}...")
|
| 552 |
+
|
| 553 |
yoruba_text = ' '.join(translations)
|
| 554 |
+
log_lines.append(f"**MT** completed in {time.time()-t0:.2f}s")
|
| 555 |
log_lines.append(f"{yoruba_text[:300]}{'...' if len(yoruba_text) > 300 else ''}")
|
| 556 |
|
| 557 |
if not yoruba_text:
|
| 558 |
return None, "Translation returned empty text."
|
| 559 |
|
| 560 |
+
# Step 4: TTS - chunk long text into sentence groups to avoid hanging
|
| 561 |
+
progress(0.65, desc="Synthesizing Yoruba speech...")
|
| 562 |
t0 = time.time()
|
| 563 |
+
|
| 564 |
+
# Split Yoruba text into chunks of up to SENTENCES_PER_TTS_CHUNK (2) sentences each for faster TTS
|
| 565 |
+
yoruba_sentences = re.split(r'(?<=[.!?])\s+', yoruba_text)
|
| 566 |
+
yoruba_sentences = [s.strip() for s in yoruba_sentences if s.strip()]
|
| 567 |
+
n_yo = len(yoruba_sentences)
|
| 568 |
+
|
| 569 |
+
SENTENCES_PER_TTS_CHUNK = 2
|
| 570 |
+
audio_segments = []
|
| 571 |
+
output_sr = None
|
| 572 |
+
|
| 573 |
+
for i in range(0, n_yo, SENTENCES_PER_TTS_CHUNK):
|
| 574 |
+
chunk_sents = yoruba_sentences[i:i + SENTENCES_PER_TTS_CHUNK]
|
| 575 |
+
chunk_text = ' '.join(chunk_sents)
|
| 576 |
+
if not chunk_text:
|
| 577 |
+
continue
|
| 578 |
+
|
| 579 |
+
audio_seg, seg_sr = synthesize(chunk_text)
|
| 580 |
+
if output_sr is None:
|
| 581 |
+
output_sr = seg_sr
|
| 582 |
+
if len(audio_seg) > 0:
|
| 583 |
+
audio_segments.append(audio_seg)
|
| 584 |
+
# Add small silence between chunks (200ms)
|
| 585 |
+
silence = np.zeros(int(0.2 * seg_sr), dtype=np.float32)
|
| 586 |
+
audio_segments.append(silence)
|
| 587 |
+
|
| 588 |
+
# Update progress per TTS chunk
|
| 589 |
+
tts_progress = 0.65 + (0.2 * (i + SENTENCES_PER_TTS_CHUNK) / n_yo)
|
| 590 |
+
progress(min(tts_progress, 0.85), desc=f"Synthesizing audio {min(i+SENTENCES_PER_TTS_CHUNK, n_yo)}/{n_yo}...")
|
| 591 |
+
|
| 592 |
+
if not audio_segments:
|
| 593 |
+
return None, "TTS produced no audio."
|
| 594 |
+
|
| 595 |
+
yoruba_audio = np.concatenate(audio_segments)
|
| 596 |
sf.write(yoruba_audio_raw, yoruba_audio, output_sr)
|
| 597 |
yoruba_duration = len(yoruba_audio) / output_sr
|
| 598 |
log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
|
| 599 |
+
log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
|
| 600 |
|
| 601 |
# Step 5: Time-align Yoruba audio to match video duration
|
| 602 |
progress(0.85, desc="Aligning audio to video duration...")
|