artificialguybr committed · commit 75517c0 · 1 parent: 285da88

Update app.py
app.py CHANGED
@@ -5,7 +5,8 @@ import subprocess
 import os, stat
 import uuid
 from googletrans import Translator
-
+import edge_tts
+import asyncio
 import ffmpeg
 import json
 from scipy.signal import wiener
@@ -24,7 +25,6 @@ from huggingface_hub import HfApi
 import moviepy.editor as mp
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
-os.environ["COQUI_TOS_AGREED"] = "1"
 api = HfApi(token=HF_TOKEN)
 repo_id = "artificialguybr/video-dubbing"
 ZipFile("ffmpeg.zip").extractall()
@@ -121,6 +121,10 @@ def transcribe_audio(file_path):
 
     return result
 
+async def text_to_speech(text, voice, output_file):
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(output_file)
+
 @spaces.GPU
 def process_video(radio, video, target_language, has_closeup_face):
     try:
@@ -156,15 +160,34 @@ def process_video(radio, video, target_language, has_closeup_face):
         print(f"Error encountered during transcription: {str(e)}")
         raise
 
-    language_mapping = {
-        …
+    language_mapping = {
+        'English': ('en', 'en-US-EricNeural'),
+        'Spanish': ('es', 'es-ES-AlvaroNeural'),
+        'French': ('fr', 'fr-FR-HenriNeural'),
+        'German': ('de', 'de-DE-ConradNeural'),
+        'Italian': ('it', 'it-IT-DiegoNeural'),
+        'Portuguese': ('pt', 'pt-PT-DuarteNeural'),
+        'Polish': ('pl', 'pl-PL-MarekNeural'),
+        'Turkish': ('tr', 'tr-TR-AhmetNeural'),
+        'Russian': ('ru', 'ru-RU-DmitryNeural'),
+        'Dutch': ('nl', 'nl-NL-MaartenNeural'),
+        'Czech': ('cs', 'cs-CZ-AntoninNeural'),
+        'Arabic': ('ar', 'ar-SA-HamedNeural'),
+        'Chinese (Simplified)': ('zh-CN', 'zh-CN-YunxiNeural'),
+        'Japanese': ('ja', 'ja-JP-KeitaNeural'),
+        'Korean': ('ko', 'ko-KR-InJoonNeural'),
+        'Hindi': ('hi', 'hi-IN-MadhurNeural'),
+        'Swedish': ('sv', 'sv-SE-MattiasNeural'),
+        'Danish': ('da', 'da-DK-JeppeNeural'),
+        'Finnish': ('fi', 'fi-FI-HarriNeural'),
+        'Greek': ('el', 'el-GR-NestorasNeural')
+    }
+    target_language_code, voice = language_mapping[target_language]
     translator = Translator()
     translated_text = translator.translate(whisper_text, dest=target_language_code).text
     print(translated_text)
 
-
-    tts.to('cuda')
-    tts.tts_to_file(translated_text, speaker_wav=f"{run_uuid}_output_audio_final.wav", file_path=f"{run_uuid}_output_synth.wav", language=target_language_code)
+    asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
 
     pad_top = 0
     pad_bottom = 15
@@ -228,7 +251,7 @@ iface = gr.Interface(
     inputs=[
         radio,
         video,
-        gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)"], label="Target Language for Dubbing", value="Spanish"),
+        gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)", "Japanese", "Korean", "Hindi", "Swedish", "Danish", "Finnish", "Greek"], label="Target Language for Dubbing", value="Spanish"),
         gr.Checkbox(
             label="Video has a close-up face. Use Wav2lip.",
             value=False,
@@ -246,10 +269,9 @@ with gr.Blocks() as demo:
     radio.change(swap, inputs=[radio], outputs=video)
     gr.Markdown("""
    **Note:**
-    - Video limit is 1 minute. It will
+    - Video limit is 1 minute. It will dub all speakers using just one voice.
     - Generation may take up to 5 minutes.
-    -
-    - The tool uses open-source models for all models. It's a alpha version.
+    - The tool uses only open-source models. It's an alpha version.
     - Quality can be improved but would require more processing time per video. For scalability and hardware limitations, speed was chosen, not just quality.
     - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
     - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
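The change swaps the Coqui XTTS synthesis step (the removed tts.to('cuda') / tts.tts_to_file(...) lines) for Microsoft's edge-tts neural voices, driven through the new async text_to_speech helper. A minimal standalone sketch of that path, assuming only that edge-tts is installed (pip install edge-tts); the sample text, voice and output file name are illustrative, not taken from the commit:

# Standalone check of the edge-tts synthesis path added in this commit.
import asyncio
import edge_tts

async def text_to_speech(text, voice, output_file):
    # Same helper shape as in the commit: build a Communicate object and save it.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)

# Same (language code, voice) pair the commit maps for 'Spanish'.
asyncio.run(text_to_speech("Hola, esto es una prueba de doblaje.",
                           "es-ES-AlvaroNeural",
                           "test_synth.mp3"))

Since process_video stays a plain synchronous function under @spaces.GPU, the commit bridges into the coroutine with asyncio.run(), exactly as above.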
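Each language_mapping entry pairs the googletrans language code with an edge-tts voice ShortName. To extend the dropdown with another language, a candidate voice can be looked up first; a sketch assuming the installed edge-tts release exposes the async list_voices() helper and that its entries carry ShortName, Locale and Gender fields (the nb-NO locale used below is only an example):

# List voices for a locale prefix so a new language_mapping entry can be chosen.
# Assumes edge_tts.list_voices() is available in this edge-tts version.
import asyncio
import edge_tts

async def candidate_voices(locale_prefix, gender="Male"):
    voices = await edge_tts.list_voices()
    return [v["ShortName"] for v in voices
            if v["Locale"].startswith(locale_prefix) and v["Gender"] == gender]

print(asyncio.run(candidate_voices("nb-NO")))

Whichever ShortName is picked then goes into language_mapping next to the language code that googletrans expects, and the new language name is added to the gr.Dropdown choices.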