PlotweaverModel committed on
Commit
fdd3dce
·
verified ·
1 Parent(s): 79f78ef

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -9
app.py CHANGED
@@ -533,29 +533,70 @@ def dub_video(video_path, progress=gr.Progress()):
533
  if not english_text:
534
  return None, "ASR returned empty text. The video may have no audible speech."
535
 
536
- # Step 3: Translate (using beam search for best quality since this is batch)
537
- progress(0.5, desc="Translating English to Yoruba...")
538
  t0 = time.time()
539
  sentences = split_into_sentences(english_text)
 
 
 
540
  translations = []
541
- for s in sentences:
542
- yo = translate_sentence(s, fast=False) # beam search for quality
 
 
543
  translations.append(yo)
 
 
 
 
544
  yoruba_text = ' '.join(translations)
545
- log_lines.append(f"\n**MT** ({time.time()-t0:.2f}s, {len(sentences)} sentences)")
546
  log_lines.append(f"{yoruba_text[:300]}{'...' if len(yoruba_text) > 300 else ''}")
547
 
548
  if not yoruba_text:
549
  return None, "Translation returned empty text."
550
 
551
- # Step 4: TTS
552
- progress(0.7, desc="Synthesizing Yoruba speech...")
553
  t0 = time.time()
554
- yoruba_audio, output_sr = synthesize(yoruba_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  sf.write(yoruba_audio_raw, yoruba_audio, output_sr)
556
  yoruba_duration = len(yoruba_audio) / output_sr
557
  log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
558
- log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio")
559
 
560
  # Step 5: Time-align Yoruba audio to match video duration
561
  progress(0.85, desc="Aligning audio to video duration...")
 
533
  if not english_text:
534
  return None, "ASR returned empty text. The video may have no audible speech."
535
 
536
+ # Step 3: Translate (fast greedy decoding for speed - still good quality)
537
+ progress(0.3, desc="Translating English to Yoruba...")
538
  t0 = time.time()
539
  sentences = split_into_sentences(english_text)
540
+ n_sentences = len(sentences)
541
+ log_lines.append(f"\n**MT** starting ({n_sentences} sentences)")
542
+
543
  translations = []
544
+ for i, s in enumerate(sentences):
545
+ # Fast mode (greedy) is 3-4x faster than beam search
546
+ # Still produces good quality for most sentences
547
+ yo = translate_sentence(s, fast=True)
548
  translations.append(yo)
549
+ # Update progress per sentence
550
+ mt_progress = 0.3 + (0.35 * (i + 1) / n_sentences)
551
+ progress(mt_progress, desc=f"Translating {i+1}/{n_sentences}...")
552
+
553
  yoruba_text = ' '.join(translations)
554
+ log_lines.append(f"**MT** completed in {time.time()-t0:.2f}s")
555
  log_lines.append(f"{yoruba_text[:300]}{'...' if len(yoruba_text) > 300 else ''}")
556
 
557
  if not yoruba_text:
558
  return None, "Translation returned empty text."
559
 
560
+ # Step 4: TTS - chunk long text into sentence groups to avoid hanging
561
+ progress(0.65, desc="Synthesizing Yoruba speech...")
562
  t0 = time.time()
563
+
564
+ # Split Yoruba text into chunks of ~3 sentences each for faster TTS
565
+ yoruba_sentences = re.split(r'(?<=[.!?])\s+', yoruba_text)
566
+ yoruba_sentences = [s.strip() for s in yoruba_sentences if s.strip()]
567
+ n_yo = len(yoruba_sentences)
568
+
569
+ SENTENCES_PER_TTS_CHUNK = 2
570
+ audio_segments = []
571
+ output_sr = None
572
+
573
+ for i in range(0, n_yo, SENTENCES_PER_TTS_CHUNK):
574
+ chunk_sents = yoruba_sentences[i:i + SENTENCES_PER_TTS_CHUNK]
575
+ chunk_text = ' '.join(chunk_sents)
576
+ if not chunk_text:
577
+ continue
578
+
579
+ audio_seg, seg_sr = synthesize(chunk_text)
580
+ if output_sr is None:
581
+ output_sr = seg_sr
582
+ if len(audio_seg) > 0:
583
+ audio_segments.append(audio_seg)
584
+ # Add small silence between chunks (200ms)
585
+ silence = np.zeros(int(0.2 * seg_sr), dtype=np.float32)
586
+ audio_segments.append(silence)
587
+
588
+ # Update progress per TTS chunk
589
+ tts_progress = 0.65 + (0.2 * (i + SENTENCES_PER_TTS_CHUNK) / n_yo)
590
+ progress(min(tts_progress, 0.85), desc=f"Synthesizing audio {min(i+SENTENCES_PER_TTS_CHUNK, n_yo)}/{n_yo}...")
591
+
592
+ if not audio_segments:
593
+ return None, "TTS produced no audio."
594
+
595
+ yoruba_audio = np.concatenate(audio_segments)
596
  sf.write(yoruba_audio_raw, yoruba_audio, output_sr)
597
  yoruba_duration = len(yoruba_audio) / output_sr
598
  log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
599
+ log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
600
 
601
  # Step 5: Time-align Yoruba audio to match video duration
602
  progress(0.85, desc="Aligning audio to video duration...")