vitaliy-sharandin committed on
Commit
3fd57b8
1 Parent(s): 2353dd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -9
app.py CHANGED
@@ -161,10 +161,7 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
161
  final_audio_track = None
162
 
163
  try:
164
- # TODO uncomment when https://github.com/coqui-ai/TTS/issues/3224 is resolved
165
- # tts = TTS(selected_model).to(device)
166
-
167
- # Generate and concatenate voice clips per speaker
168
 
169
  last_end_time = 0
170
  clips = []
@@ -184,9 +181,18 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
184
 
185
  # Generate speech
186
  print(f"[{speech_item['speaker']}]")
187
- tts = TTS(selected_model).to(device)
188
- audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
189
- sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
 
 
 
 
 
 
 
 
 
190
 
191
  # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
192
  audio_duration = len(audio) / sample_rate
@@ -209,8 +215,6 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
209
 
210
  last_end_time = speech_item['start'] + audio_clip.duration
211
 
212
- del tts; import gc; gc.collect(); torch.cuda.empty_cache()
213
-
214
  # Merge sentences
215
  final_audio_track = mp.concatenate_audioclips(clips)
216
 
 
161
  final_audio_track = None
162
 
163
  try:
164
+ tts = TTS(selected_model).to(device)
 
 
 
165
 
166
  last_end_time = 0
167
  clips = []
 
181
 
182
  # Generate speech
183
  print(f"[{speech_item['speaker']}]")
184
+
185
+ sample_rate = None
186
+ audio = None
187
+ if 'vits' in selected_model:
188
+ audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
189
+ sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
190
+ else:
191
+ # TODO remove when https://github.com/coqui-ai/TTS/issues/3224 is resolved
192
+ tts = TTS(selected_model).to(device)
193
+ audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
194
+ sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
195
+ del tts; import gc; gc.collect(); torch.cuda.empty_cache()
196
 
197
  # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
198
  audio_duration = len(audio) / sample_rate
 
215
 
216
  last_end_time = speech_item['start'] + audio_clip.duration
217
 
 
 
218
  # Merge sentences
219
  final_audio_track = mp.concatenate_audioclips(clips)
220