Spaces:

r3gm
/

SoniTranslate_translate_audio_of_a_video_content

Running on Zero

App Files Files Community

SoniTranslate_translate_audio_of_a_video_content / soni_translate /video_dubbing.py

r3gm

Upload 4 files

fc97911 11 months ago

raw history blame

No virus

7.68 kB

	import contextlib
	import gc
	import glob
	import os
	import shutil
	import subprocess
	import time

	import numpy as np
	import gradio as gr
	import whisperx
	import torch
	from gtts import gTTS
	import librosa
	import edge_tts
	from pydub import AudioSegment
	from tqdm import tqdm
	from deep_translator import GoogleTranslator

	from soni_translate.audio_segments import create_translated_audio
	from soni_translate.text_to_speech import make_voice
	from soni_translate.translate_segments import translate_text

	def translate_from_video(
	    video,
	    YOUR_HF_TOKEN,
	    preview=False,
	    WHISPER_MODEL_SIZE="large-v1",
	    batch_size=16,
	    compute_type="float16",
	    SOURCE_LANGUAGE="Automatic detection",
	    TRANSLATE_AUDIO_TO="en",
	    min_speakers=1,
	    max_speakers=2,
	    tts_voice00="en-AU-WilliamNeural-Male",
	    tts_voice01="en-CA-ClaraNeural-Female",
	    tts_voice02="en-GB-ThomasNeural-Male",
	    tts_voice03="en-GB-SoniaNeural-Female",
	    tts_voice04="en-NZ-MitchellNeural-Male",
	    tts_voice05="en-GB-MaisieNeural-Female",
	    video_output="video_dub.mp4",
	):
	    """Dub the speech of a video into another language.

	    Steps: obtain ``Video.mp4`` and ``audio.wav`` from ``video``, transcribe
	    and align the audio with whisperx, assign speakers, translate the
	    segments, synthesize one clip per segment, time-stretch each clip
	    towards the length of its original segment, then mix the dubbed track
	    over the original audio and mux it with the video.

	    All intermediate files (``Video.mp4``, ``audio.wav``,
	    ``audio_dub_solo.ogg``, ``audio_mix.mp3``, ``audio/``, ``audio2/audio/``)
	    are created relative to the current working directory.

	    External programs are started from argument lists, never through a
	    shell, so ``video`` and ``video_output`` cannot inject shell syntax.

	    Args:
	        video: Path of a local video file. If no such path exists the value
	            is handed to yt-dlp as a download URL.
	        YOUR_HF_TOKEN: Token passed to ``whisperx.DiarizationPipeline``.
	            When ``""`` or ``None`` the ``YOUR_HF_TOKEN`` environment
	            variable is used instead.
	        preview: When true only a 10 second clip starting at 00:00:20 is
	            processed.
	        WHISPER_MODEL_SIZE: Model name passed to ``whisperx.load_model``.
	        batch_size: Batch size passed to ``model.transcribe``.
	        compute_type: Compute type for whisperx; forced to ``"float32"``
	            when CUDA is not available.
	        SOURCE_LANGUAGE: Language passed to whisperx;
	            ``"Automatic detection"`` is converted to ``None``.
	        TRANSLATE_AUDIO_TO: Target language passed to ``translate_text``,
	            ``make_voice`` and gTTS.
	        min_speakers: Lower speaker bound passed to the diarization model.
	        max_speakers: Upper speaker bound passed to the diarization model.
	        tts_voice00 .. tts_voice05: Voices used for ``SPEAKER_00`` ..
	            ``SPEAKER_05``. The string ``'None'`` leaves a speaker without
	            a voice, in which case gTTS is used for that speaker.
	        video_output: Path of the dubbed video to write.

	    Returns:
	        ``video_output``, or ``None`` when the audio could not be downloaded
	        or the transcription contains no segments.
	    """
	    if YOUR_HF_TOKEN is None or YOUR_HF_TOKEN == "":
	        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")

	    os.makedirs('audio', exist_ok=True)
	    os.makedirs('audio2/audio', exist_ok=True)

	    # Check GPU
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    if device == "cpu":
	        compute_type = "float32"

	    OutputFile = 'Video.mp4'
	    audio_wav = "audio.wav"
	    Output_name_file = "audio_dub_solo.ogg"
	    mix_audio = "audio_mix.mp3"

	    # Drop leftovers of a previous run.
	    for stale in (OutputFile, "audio.webm", audio_wav):
	        _remove_if_present(stale)

	    extract_wav = [
	        "ffmpeg", "-y", "-i", OutputFile, "-vn",
	        "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2", audio_wav,
	    ]
	    ytdlp_common = [
	        "--force-overwrites", "--max-downloads", "1", "--no-warnings",
	        "--no-abort-on-error", "--ignore-no-formats-error",
	    ]

	    if os.path.exists(video):
	        transcode = ["ffmpeg", "-y", "-i", video]
	        if preview:
	            print('Creating preview video, 10 seconds')
	            transcode += ["-ss", "00:00:20", "-t", "00:00:10"]
	        transcode += [
	            "-c:v", "libx264", "-c:a", "aac", "-strict", "experimental",
	            OutputFile,
	        ]
	        _run_command(transcode)
	        _run_command(extract_wav)
	    elif preview:
	        print('Creating preview from link, 10 seconds')
	        # https://github.com/yt-dlp/yt-dlp/issues/2220
	        # "--" ends option parsing so the URL is never read as an option.
	        _run_command([
	            "yt-dlp", "-f", "mp4", "--downloader", "ffmpeg",
	            "--downloader-args", "ffmpeg_i: -ss 00:00:20 -t 00:00:10",
	            *ytdlp_common, "--restrict-filenames", "-o", OutputFile,
	            "--", video,
	        ])
	        _run_command(extract_wav)
	    else:
	        # Audio first, then the video once the wav file is in place.
	        _run_command([
	            "python", "-m", "yt_dlp", "--output", audio_wav,
	            *ytdlp_common, "--extract-audio", "--audio-format", "wav",
	            "--", video,
	        ])
	        for _ in range(120):
	            time.sleep(1)
	            print('process audio')
	            # A remaining audio.webm means the wav conversion is not done.
	            if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
	                time.sleep(1)
	                _run_command([
	                    "yt-dlp", "-f", "mp4", *ytdlp_common,
	                    "--restrict-filenames", "-o", OutputFile, "--", video,
	                ])
	                break
	        else:
	            print('Error donwloading the audio')
	            return None

	    print("Set file complete.")

	    if SOURCE_LANGUAGE == 'Automatic detection':
	        SOURCE_LANGUAGE = None

	    # 1. Transcribe with original whisper (batched)
	    model = whisperx.load_model(
	        WHISPER_MODEL_SIZE,
	        device,
	        compute_type=compute_type,
	        language=SOURCE_LANGUAGE,
	    )
	    audio = whisperx.load_audio(audio_wav)
	    result = model.transcribe(audio, batch_size=batch_size)
	    # Drop the reference first, otherwise the collection frees nothing.
	    del model
	    _release_memory()
	    print("Transcript complete")

	    # 2. Align whisper output
	    model_a, metadata = whisperx.load_align_model(
	        language_code=result["language"],
	        device=device,
	    )
	    result = whisperx.align(
	        result["segments"],
	        model_a,
	        metadata,
	        audio,
	        device,
	        return_char_alignments=True,
	    )
	    del model_a
	    _release_memory()
	    print("Align complete")

	    if not result['segments']:
	        print('No active speech found in audio')
	        return None

	    # 3. Assign speaker labels
	    diarize_model = whisperx.DiarizationPipeline(
	        use_auth_token=YOUR_HF_TOKEN, device=device
	    )
	    diarize_segments = diarize_model(
	        audio_wav,
	        min_speakers=min_speakers,
	        max_speakers=max_speakers,
	    )
	    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
	    del diarize_model
	    _release_memory()
	    print("Diarize complete")

	    result_diarize['segments'] = translate_text(
	        result_diarize['segments'], TRANSLATE_AUDIO_TO
	    )
	    print("Translation complete")

	    audio_files = []

	    # Mapping speakers to voice variables
	    speaker_to_voice = {
	        'SPEAKER_00': tts_voice00,
	        'SPEAKER_01': tts_voice01,
	        'SPEAKER_02': tts_voice02,
	        'SPEAKER_03': tts_voice03,
	        'SPEAKER_04': tts_voice04,
	        'SPEAKER_05': tts_voice05,
	    }

	    for segment in tqdm(result_diarize['segments']):
	        text = segment['text']
	        start = segment['start']
	        end = segment['end']

	        if 'speaker' not in segment:
	            segment['speaker'] = "SPEAKER_99"
	            print("NO SPEAKER DETECT IN SEGMENT")
	        speaker = segment['speaker']

	        # make the tts audio
	        filename = f"audio/{start}.ogg"

	        if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
	            make_voice(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
	        else:
	            # Every segment must end up with a clip: the duration probe
	            # below reads the file, and audio_files is presumably matched
	            # to the segments by create_translated_audio (TODO confirm).
	            # So unlabeled speakers (SPEAKER_99), speakers beyond
	            # SPEAKER_05 and speakers whose voice is 'None' all use gTTS.
	            if speaker != "SPEAKER_99":
	                print(f"No TTS voice set for {speaker}, falling back to GTTS")
	            try:
	                gTTS(text, lang=TRANSLATE_AUDIO_TO).save(filename)
	                print('Using GTTS')
	            except Exception:
	                # Best effort: keep a placeholder clip so the run continues.
	                gTTS('a', lang=TRANSLATE_AUDIO_TO).save(filename)
	                print('Error: Audio will be replaced.')

	        # Tempo needed to bring the clip towards the original segment length.
	        duration_true = end - start
	        duration_tts = librosa.get_duration(filename=filename)
	        tempo = _tempo_factor(duration_tts, duration_true)

	        # Write the time-stretched copy into the audio2 folder.
	        stretched = f"audio2/{filename}"
	        _run_command([
	            "ffmpeg", "-y", "-loglevel", "panic", "-i", filename,
	            "-filter:a", f"atempo={tempo}", stretched,
	        ])

	        # Probe the stretched clip; presumably this raises when ffmpeg did
	        # not produce a readable file, as the original code relied on.
	        librosa.get_duration(filename=stretched)
	        audio_files.append(filename)

	    # Replace the synthesized clips with their time-stretched versions.
	    for stretched in glob.glob("audio2/audio/*.ogg"):
	        shutil.move(stretched, os.path.join("audio", os.path.basename(stretched)))

	    _remove_if_present(Output_name_file)
	    create_translated_audio(result_diarize, audio_files, Output_name_file)

	    # Mix: the dubbed track is used as the side-chain that compresses the
	    # original audio, then both are merged into one stream.
	    _remove_if_present(mix_audio)
	    _run_command([
	        "ffmpeg", "-y", "-i", audio_wav, "-i", Output_name_file,
	        "-filter_complex",
	        "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]",
	        "-map", "[final]", mix_audio,
	    ])

	    # Mux the original video stream with the mixed audio.
	    _remove_if_present(video_output)
	    _run_command([
	        "ffmpeg", "-y", "-i", OutputFile, "-i", mix_audio,
	        "-c:v", "copy", "-c:a", "copy", "-map", "0:v", "-map", "1:a",
	        "-shortest", video_output,
	    ])

	    return video_output


	def _run_command(cmd):
	    """Run ``cmd`` (an argument list) without a shell and return its exit status.

	    Failures are reported but not raised, matching the best-effort behavior
	    of the ``os.system`` calls this replaces. Returns ``None`` when the
	    program could not be started.
	    """
	    try:
	        return subprocess.run(cmd, check=False).returncode
	    except OSError as err:
	        print(f"Could not run {cmd[0]}: {err}")
	        return None


	def _remove_if_present(path):
	    """Delete ``path``, ignoring a missing file or any other OS error."""
	    with contextlib.suppress(OSError):
	        os.remove(path)


	def _release_memory():
	    """Run the garbage collector and release cached CUDA memory."""
	    gc.collect()
	    torch.cuda.empty_cache()


	def _tempo_factor(duration_tts, duration_true):
	    """Return the ffmpeg ``atempo`` value for a synthesized clip.

	    The raw ratio ``duration_tts / duration_true`` is capped at 2.1, set to
	    1.0 when it lies within 0.8..1.2, raised to 0.8 when below that, and
	    rounded to one decimal. A segment with no positive duration gets the
	    maximum speed-up instead of dividing by zero.
	    """
	    if duration_true <= 0:
	        return 2.1

	    ratio = duration_tts / duration_true
	    if ratio > 2.1:
	        ratio = 2.1
	    elif 0.8 <= ratio <= 1.2:
	        ratio = 1.0
	    elif ratio <= 0.79:
	        ratio = 0.8

	    return round(ratio, 1)