import numpy as np
import gradio as gr
import whisperx
import torch
from gtts import gTTS
import librosa
import edge_tts
import gc
from pydub import AudioSegment
from tqdm import tqdm
from deep_translator import GoogleTranslator
import os
from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import make_voice
from soni_translate.translate_segments import translate_text
import time


def translate_from_video(
    video,
    YOUR_HF_TOKEN,
    preview=False,
    WHISPER_MODEL_SIZE="large-v1",
    batch_size=16,
    compute_type="float16",
    SOURCE_LANGUAGE="Automatic detection",
    TRANSLATE_AUDIO_TO="en",
    min_speakers=1,
    max_speakers=2,
    tts_voice00="en-AU-WilliamNeural-Male",
    tts_voice01="en-CA-ClaraNeural-Female",
    tts_voice02="en-GB-ThomasNeural-Male",
    tts_voice03="en-GB-SoniaNeural-Female",
    tts_voice04="en-NZ-MitchellNeural-Male",
    tts_voice05="en-GB-MaisieNeural-Female",
    video_output="video_dub.mp4",
):
    if YOUR_HF_TOKEN == "" or YOUR_HF_TOKEN is None:
        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")

    if not os.path.exists('audio'):
        os.makedirs('audio')

    if not os.path.exists('audio2/audio'):
        os.makedirs('audio2/audio')

    # Check GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float32" if device == "cpu" else compute_type

    OutputFile = 'Video.mp4'
    audio_wav = "audio.wav"
    Output_name_file = "audio_dub_solo.ogg"
    mix_audio = "audio_mix.mp3"

    # Remove leftovers from a previous run
    os.system("rm Video.mp4")
    os.system("rm audio.webm")
    os.system("rm audio.wav")

    if os.path.exists(video):
        if preview:
            print('Creating preview video, 10 seconds')
            os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
        else:
            os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')
        os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
    else:
        if preview:
            print('Creating preview from link, 10 seconds')
            # https://github.com/yt-dlp/yt-dlp/issues/2220
            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
            os.system(mp4_)
            os.system(wav_)
        else:
            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
            os.system(wav_)
            # Poll for up to 120 seconds while yt-dlp extracts the audio,
            # then fetch the video once the wav file is in place.
            for i in range(120):
                time.sleep(1)
                print('process audio')
                if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
                    time.sleep(1)
                    os.system(mp4_)
                    break
                if i == 119:
                    print('Error downloading the audio')
                    return

    print("Set file complete.")

    SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE

    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model(
        WHISPER_MODEL_SIZE,
        device,
        compute_type=compute_type,
        language=SOURCE_LANGUAGE,
    )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    gc.collect(); torch.cuda.empty_cache(); del model
    print("Transcript complete")
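
    # The alignment step below maps Whisper's segment-level timestamps down
    # to word (and character) level with a language-specific phoneme model,
    # which keeps per-segment timing tight when the dubbed audio is laid in.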
    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device,
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
    )
    gc.collect(); torch.cuda.empty_cache(); del model_a
    print("Align complete")

    if result['segments'] == []:
        print('No active speech found in audio')
        return

    # 3. Assign speaker labels
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers)
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    gc.collect(); torch.cuda.empty_cache(); del diarize_model
    print("Diarize complete")

    # 4. Translate the transcribed segments
    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")

    audio_files = []

    # Mapping speakers to voice variables
    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05,
    }

    for segment in tqdm(result_diarize['segments']):

        text = segment['text']
        start = segment['start']
        end = segment['end']

        try:
            speaker = segment['speaker']
        except KeyError:
            segment['speaker'] = "SPEAKER_99"
            speaker = segment['speaker']
            print("No speaker detected in segment")

        # make the tts audio
        filename = f"audio/{start}.ogg"

        if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
            make_voice(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
        elif speaker == "SPEAKER_99":
            try:
                tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Using GTTS')
            except Exception:
                tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Error: Audio will be replaced.')

        # duration
        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)

        # Tempo factor: ratio of TTS duration to the original segment duration,
        # clamped so the dub is never stretched or squeezed too far
        porcentaje = duration_tts / duration_true

        if porcentaje > 2.1:
            porcentaje = 2.1
        elif porcentaje <= 1.2 and porcentaje >= 0.8:
            porcentaje = 1.0
        elif porcentaje <= 0.79:
            porcentaje = 0.8

        # Smooth and round
        porcentaje = round(porcentaje, 1)

        # Apply acceleration (or the opposite) to the audio file in the audio2 folder
        os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")

        duration_create = librosa.get_duration(filename=f"audio2/{filename}")
        audio_files.append(filename)

    # Replace the original TTS files with the tempo-adjusted ones
    os.system("mv -f audio2/audio/*.ogg audio/")

    os.system(f"rm {Output_name_file}")
    create_translated_audio(result_diarize, audio_files, Output_name_file)

    os.system(f"rm {mix_audio}")
    # Duck the original audio under the dub with a sidechain compressor, then merge
    os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg];[bg][mix]amerge[final]" -map [final] {mix_audio}')

    os.system(f"rm {video_output}")
    os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")

    return video_output
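

# A minimal usage sketch, assuming this file is importable as a script and a
# Hugging Face token with access to the pyannote diarization models is set in
# the environment. "sample.mp4" and the argument values below are illustrative
# placeholders, not files or defaults shipped with the project.
if __name__ == "__main__":
    dubbed = translate_from_video(
        video="sample.mp4",                      # local file, or a URL handled by yt-dlp
        YOUR_HF_TOKEN=os.getenv("YOUR_HF_TOKEN"),
        preview=True,                            # dub only a 10-second slice first
        TRANSLATE_AUDIO_TO="en",
        min_speakers=1,
        max_speakers=2,
    )
    print(f"Dubbed video written to: {dubbed}")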