import tempfile import gradio as gr import subprocess import os, stat from googletrans import Translator from TTS.api import TTS import ffmpeg import whisper from scipy.signal import wiener import soundfile as sf from pydub import AudioSegment import numpy as np import librosa from zipfile import ZipFile import shlex import librosa import numpy as np import cv2 import torch import torchvision from tqdm import tqdm from numba import jit os.environ["COQUI_TOS_AGREED"] = "1" ZipFile("ffmpeg.zip").extractall() st = os.stat('ffmpeg') os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC) def process_video(video, high_quality, target_language): output_filename = "resized_video.mp4" if high_quality: ffmpeg.input(video).output(output_filename, vf='scale=-1:720').run() video_path = output_filename else: video_path = video # Debugging Step 1: Check if video_path exists if not os.path.exists(video_path): return f"Error: {video_path} does not exist." ffmpeg.input(video_path).output('output_audio.wav', acodec='pcm_s24le', ar=48000, map='a').run() y, sr = sf.read("output_audio.wav") y = y.astype(np.float32) y_denoised = wiener(y) sf.write("output_audio_denoised.wav", y_denoised, sr) sound = AudioSegment.from_file("output_audio_denoised.wav", format="wav") sound = sound.apply_gain(0) # Reduce gain by 5 dB sound = sound.low_pass_filter(3000).high_pass_filter(100) sound.export("output_audio_processed.wav", format="wav") shell_command = f"ffmpeg -y -i output_audio_processed.wav -af lowpass=3000,highpass=100 output_audio_final.wav".split(" ") subprocess.run([item for item in shell_command], capture_output=False, text=True, check=True) model = whisper.load_model("base") result = model.transcribe("output_audio_final.wav") whisper_text = result["text"] whisper_language = result['language'] print(whisper_text) language_mapping = {'English': 'en', 'Spanish': 'es', 'French': 'fr', 'German': 'de', 'Italian': 'it', 'Portuguese': 'pt', 'Polish': 'pl', 'Turkish': 'tr', 'Russian': 'ru', 'Dutch': 'nl', 'Czech': 'cs', 'Arabic': 'ar', 'Chinese (Simplified)': 'zh-cn'} target_language_code = language_mapping[target_language] translator = Translator() try: translated_text = translator.translate(whisper_text, src=whisper_language, dest=target_language_code).text print(translated_text) except AttributeError as e: print("Failed to translate text. Likely an issue with token extraction in the Google Translate API.") translated_text = "Translation failed due to API issue." tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1") tts.to('cuda') # Replacing deprecated gpu=True tts.tts_to_file(translated_text, speaker_wav='output_audio_final.wav', file_path="output_synth.wav", language=target_language_code) pad_top = 0 pad_bottom = 15 pad_left = 0 pad_right = 0 rescaleFactor = 1 # Debugging Step 2: Remove quotes around the video path video_path_fix = video_path cmd = f"python Wav2Lip/inference.py --checkpoint_path '/Wav2Lip/checkpoints/wav2lip_gan.pth' --face {shlex.quote(video_path_fix)} --audio 'output_synth.wav' --pads {pad_top} {pad_bottom} {pad_left} {pad_right} --resize_factor {rescaleFactor} --nosmooth --outfile 'output_video.mp4'" subprocess.run(cmd, shell=True) # Debugging Step 3: Check if output video exists if not os.path.exists("output_video.mp4"): return "Error: output_video.mp4 was not generated." return "output_video.mp4" iface = gr.Interface( fn=process_video, inputs=[ gr.Video(), gr.inputs.Checkbox(label="High Quality"), gr.inputs.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)"], label="Target Language for Dubbing") ], outputs=gr.outputs.File(), live=False ) iface.launch()