Spaces:

artificialguybr
/

video-dubbing

Running on Zero

App Files Community

artificialguybr committed on Jul 5, 2024

Commit

49f95b1

verified ·

1 Parent(s): 4fe6158

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -53

app.py CHANGED Viewed

@@ -14,7 +14,6 @@ from huggingface_hub import HfApi
 import moviepy.editor as mp
 import spaces
 # Constants and initialization
 HF_TOKEN = os.environ.get("HF_TOKEN")
 REPO_ID = "artificialguybr/video-dubbing"
@@ -50,7 +49,6 @@ language_mapping = {
     'Greek': ('el', 'el-GR-NestorasNeural')
 }
 print("Starting the program...")
 def generate_unique_filename(extension):
@@ -62,20 +60,6 @@ def cleanup_files(*files):
             os.remove(file)
             print(f"Removed file: {file}")
-def check_for_faces(video_path):
-    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
-    cap = cv2.VideoCapture(video_path)
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-        faces = face_cascade.detectMultiScale(gray, 1.1, 4)
-        if len(faces) > 0:
-            return True
-    return False
 @spaces.GPU(duration=90)
 def transcribe_audio(file_path):
     print(f"Starting transcription of file: {file_path}")
@@ -128,7 +112,7 @@ async def text_to_speech(text, voice, output_file):
     await communicate.save(output_file)
 @spaces.GPU
-def process_video(radio, video, target_language, has_closeup_face):
     try:
         if target_language is None:
             raise ValueError("Please select a Target Language for Dubbing.")
@@ -163,12 +147,12 @@ def process_video(radio, video, target_language, has_closeup_face):
         asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
-        if has_closeup_face or check_for_faces(video_path):
             try:
                 subprocess.run(f"python Wav2Lip/inference.py --checkpoint_path 'Wav2Lip/checkpoints/wav2lip_gan.pth' --face '{video_path}' --audio '{run_uuid}_output_synth.wav' --pads 0 15 0 0 --resize_factor 1 --nosmooth --outfile '{run_uuid}_output_video.mp4'", shell=True, check=True)
             except subprocess.CalledProcessError as e:
                 print(f"Wav2Lip error: {str(e)}")
-                gr.Warning("Wav2lip didn't detect a face or encountered an error. Falling back to simple audio replacement.")
                 subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
         else:
             subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
@@ -190,43 +174,53 @@ def process_video(radio, video, target_language, has_closeup_face):
         print(f"Error in process_video: {str(e)}")
         return None, f"Error: {str(e)}"
-def swap(radio):
-    return gr.update(source="upload" if radio == "Upload" else "webcam")
 # Gradio interface setup
-video = gr.Video()
-radio = gr.Radio(["Upload", "Record"], value="Upload", show_label=False)
-iface = gr.Interface(
-    fn=process_video,
-    inputs=[
-        radio,
-        video,
-        gr.Dropdown(choices=list(language_mapping.keys()), label="Target Language for Dubbing", value="Spanish"),
-        gr.Checkbox(label="Video has a close-up face. Use Wav2lip.", value=False, info="Say if video have close-up face. For Wav2lip. Will not work if checked wrongly.")
-    ],
-    outputs=[
-        gr.Video(label="Processed Video"),
-        gr.Textbox(label="Error Message")
-    ],
-    live=False,
-    title="AI Video Dubbing",
-    description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Thanks [@yeswondwer](https://twitter.com/@yeswondwerr) for original code. Test the [Video Transcription and Translate](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) space!""",
-    allow_flagging=False
-)
-with gr.Blocks() as demo:
-    iface.render()
-    radio.change(swap, inputs=[radio], outputs=video)
     gr.Markdown("""
-    **Note:**
-    - Video limit is 1 minute. It will dubbing all people using just one voice.
-    - Generation may take up to 5 minutes.
-    - The tool uses open-source models for all models. It's an alpha version.
-    - Quality can be improved but would require more processing time per video. For scalability and hardware limitations, speed was chosen, not just quality.
-    - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
-    - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
     """)
 print("Launching Gradio interface...")
 demo.queue()
-demo.launch()

 import moviepy.editor as mp
 import spaces
 # Constants and initialization
 HF_TOKEN = os.environ.get("HF_TOKEN")
 REPO_ID = "artificialguybr/video-dubbing"
     'Greek': ('el', 'el-GR-NestorasNeural')
 }
 print("Starting the program...")
 def generate_unique_filename(extension):
             os.remove(file)
             print(f"Removed file: {file}")
 @spaces.GPU(duration=90)
 def transcribe_audio(file_path):
     print(f"Starting transcription of file: {file_path}")
     await communicate.save(output_file)
 @spaces.GPU
+def process_video(video, target_language, use_wav2lip):
     try:
         if target_language is None:
             raise ValueError("Please select a Target Language for Dubbing.")
         asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
+        if use_wav2lip:
             try:
                 subprocess.run(f"python Wav2Lip/inference.py --checkpoint_path 'Wav2Lip/checkpoints/wav2lip_gan.pth' --face '{video_path}' --audio '{run_uuid}_output_synth.wav' --pads 0 15 0 0 --resize_factor 1 --nosmooth --outfile '{run_uuid}_output_video.mp4'", shell=True, check=True)
             except subprocess.CalledProcessError as e:
                 print(f"Wav2Lip error: {str(e)}")
+                gr.Warning("Wav2lip encountered an error. Falling back to simple audio replacement.")
                 subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
         else:
             subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
         print(f"Error in process_video: {str(e)}")
         return None, f"Error: {str(e)}"
 # Gradio interface setup
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# AI Video Dubbing")
+    gr.Markdown("This tool uses AI to dub videos into different languages. Upload a video, choose a target language, and get a dubbed version!")
+    with gr.Row():
+        with gr.Column(scale=2):
+            video_input = gr.Video(label="Upload Video")
+            target_language = gr.Dropdown(
+                choices=list(language_mapping.keys()),
+                label="Target Language for Dubbing",
+                value="Spanish"
+            )
+            use_wav2lip = gr.Checkbox(
+                label="Use Wav2Lip for lip sync",
+                value=False,
+                info="Enable this if the video has close-up faces. May not work for all videos."
+            )
+            submit_button = gr.Button("Process Video", variant="primary")
+        with gr.Column(scale=2):
+            output_video = gr.Video(label="Processed Video")
+            error_message = gr.Textbox(label="Status/Error Message")
+    submit_button.click(
+        process_video,
+        inputs=[video_input, target_language, use_wav2lip],
+        outputs=[output_video, error_message]
+    )
+    gr.Markdown("""
+    ## Notes:
+    - Video limit is 1 minute. The tool will dub all speakers using a single voice.
+    - Processing may take up to 5 minutes.
+    - This is an alpha version using open-source models.
+    - Quality vs. speed trade-off was made for scalability and hardware limitations.
+    - For videos longer than 1 minute, please duplicate this Space and adjust the limit in the code.
+    """)
     gr.Markdown("""
+    ---
+    Developed by [@artificialguybr](https://twitter.com/artificialguybr) using open-source tools.
+    Special thanks to Hugging Face for GPU support and [@yeswondwer](https://twitter.com/@yeswondwerr) for the original code.
+    Try our [Video Transcription and Translation](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) tool!
     """)
 print("Launching Gradio interface...")
 demo.queue()
+demo.launch()