SoniTranslate / app.py
r3gm's picture
Update app.py
2784108
raw
history blame
27.2 kB
#os.system("git clone https://github.com/R3gm/SoniTranslate")
# pip install -r requirements.txt
import numpy as np
import gradio as gr
import whisperx
import torch
from gtts import gTTS
import librosa
import edge_tts
import asyncio
import gc
from pydub import AudioSegment
from tqdm import tqdm
from deep_translator import GoogleTranslator
import os
from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import make_voice_gradio
from soni_translate.translate_segments import translate_text
#from soni_translate import test
title = "<center><strong><font size='7'>πŸ“½οΈ SoniTranslate 🈷️</font></strong></center>"
news = """ ## πŸ“– News
πŸ”₯ 2023/07/01: Support (Thanks for [text](https://github.com)).
"""
description = """ ## Translate the audio of a video content from one language to another while preserving synchronization.
This is a demo on Github project πŸ“½οΈ [SoniTranslate](https://github.com/R3gm/SoniTranslate).
πŸ“Ό You can upload a video or provide a video link. The generation is **limited to 10 seconds** to prevent errors with the queue in cpu. If you use a GPU, you won't have any of these limitations.
πŸš€ For **translate a video of any duration** and faster results, you can use the Colab notebook with GPU.
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)
"""
tutorial = """ # πŸ”° Instructions for use.
1. Upload a video on the first tab or use a video link on the second tab.
2. Choose the language in which you want to translate the video.
3. Specify the number of people speaking in the video and assign each one a text-to-speech voice suitable for the translation language.
4. Press the 'Translate' button to obtain the results.
"""
if not os.path.exists('audio'):
os.makedirs('audio')
if not os.path.exists('audio2/audio'):
os.makedirs('audio2/audio')
# Check GPU
if torch.cuda.is_available():
device = "cuda"
list_compute_type = ['float16', 'float32']
compute_type_default = 'float16'
whisper_model_default = 'large-v1'
else:
device = "cpu"
list_compute_type = ['float32']
compute_type_default = 'float32'
whisper_model_default = 'base'
print('Working in: ', device)
# Download an audio
#url = "https://www.youtube.com/watch?v=Rdi-SNhe2v4"
### INIT
list_tts = ['af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male', 'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female', 'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male', 'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female', 'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male', 'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male', 'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female', 'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male', 'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female', 'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male', 'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male', 'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female', 'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female', 'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male', 'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female', 'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male', 'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female', 'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male', 'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female', 'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female', 'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male', 'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female', 'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female', 'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female', 'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female', 'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female', 'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male', 'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male', 'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female', 'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male', 'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male', 'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male', 'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 'en-GB-ThomasNeural-Male', 'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female', 'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female', 'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male', 'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male', 'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female', 'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female', 'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female', 'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male', 'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female', 'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male', 'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male', 'en-US-SteffanNeural-Male', 'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male', 'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male', 'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female', 'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male', 'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female', 'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female', 'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male', 'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female', 'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male', 'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female', 'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female', 'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female', 'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female', 'es-MX-DaliaNeural-Female', 'es-MX-JorgeNeural-Male', 'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female', 'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male', 'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female', 'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male', 'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female', 'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male', 'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female', 'es-UY-MateoNeural-Male', 'es-UY-ValentinaNeural-Female', 'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male', 'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male', 'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male', 'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female', 'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female', 'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male', 'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female', 'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male', 'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male', 'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female', 'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female', 'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male', 'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female', 'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female', 'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male', 'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male', 'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female', 'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male', 'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female', 'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female', 'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female', 'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male', 'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male', 'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female', 'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female', 'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female', 'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female', 'lt-LT-LeonasNeural-Male', 'lt-LT-OnaNeural-Female', 'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male', 'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female', 'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female', 'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female', 'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male', 'ms-MY-OsmanNeural-Male', 'ms-MY-YasminNeural-Female', 'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male', 'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male', 'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female', 'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male', 'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female', 'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male', 'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female', 'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female', 'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female', 'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female', 'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male', 'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female', 'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female', 'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female', 'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male', 'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female', 'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male', 'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female', 'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female', 'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female', 'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female', 'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female', 'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male', 'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female', 'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male', 'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female', 'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female', 'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female', 'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female', 'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female', 'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male', 'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female', 'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male', 'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male', 'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male', 'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male', 'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female']
def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
tts_voice00, tts_voice01,tts_voice02,tts_voice03,tts_voice04,tts_voice05):
YOUR_HF_TOKEN = os.getenv("My_hf_token")
OutputFile = 'Video.mp4'
audio_wav = "audio.wav"
Output_name_file = "audio_dub_solo.wav"
mix_audio = "audio_mix.mp3"
video_output = "diar_output.mp4"
os.system(f"rm {Output_name_file}")
os.system("rm Video.mp4")
#os.system("rm diar_output.mp4")
os.system("rm audio.wav")
if os.path.exists(video):
print(f"### Start Video ###")
if device == 'cpu':
# max 1 minute in cpu
print('10 s. Limited for CPU ')
os.system(f"ffmpeg -y -i {video} -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4")
else:
os.system(f"ffmpeg -y -i {video} -c:v libx264 -c:a aac -strict experimental Video.mp4")
os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
else:
print(f"### Start {video} ###")
if device == 'cpu':
# max 1 minute in cpu
print('10 s. Limited for CPU ')
#https://github.com/yt-dlp/yt-dlp/issues/2220
mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 1 audio.wav"
else:
mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
os.system(mp4_)
os.system(wav_)
print("Set file complete.")
# 1. Transcribe with original whisper (batched)
model = whisperx.load_model(
WHISPER_MODEL_SIZE,
device,
compute_type=compute_type
)
audio = whisperx.load_audio(audio_wav)
result = model.transcribe(audio, batch_size=batch_size)
gc.collect(); torch.cuda.empty_cache(); del model
print("Transcript complete")
# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(
language_code=result["language"],
device=device
)
result = whisperx.align(
result["segments"],
model_a,
metadata,
audio,
device,
return_char_alignments=True,
)
gc.collect(); torch.cuda.empty_cache(); del model_a
print("Align complete")
# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
diarize_segments = diarize_model(
audio_wav,
min_speakers=min_speakers,
max_speakers=max_speakers)
result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
gc.collect(); torch.cuda.empty_cache(); del diarize_model
print("Diarize complete")
result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
print("Translation complete")
audio_files = []
# Mapping speakers to voice variables
speaker_to_voice = {
'SPEAKER_00': tts_voice00,
'SPEAKER_01': tts_voice01,
'SPEAKER_02': tts_voice02,
'SPEAKER_03': tts_voice03,
'SPEAKER_04': tts_voice04,
'SPEAKER_05': tts_voice05
}
for segment in result_diarize['segments']:
text = segment['text']
start = segment['start']
end = segment['end']
try:
speaker = segment['speaker']
except KeyError:
segment['speaker'] = "SPEAKER_99"
speaker = segment['speaker']
print("NO SPEAKER DETECT IN SEGMENT")
# make the tts audio
filename = f"audio/{start}.ogg"
if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
make_voice_gradio(text, speaker_to_voice[speaker], filename)
elif speaker == "SPEAKER_99":
try:
tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
tts.save(filename)
print('Using GTTS')
except:
tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
tts.save(filename)
print('ERROR AUDIO GTTS')
# duration
duration_true = end - start
duration_tts = librosa.get_duration(filename=filename)
# porcentaje
porcentaje = duration_tts / duration_true
if porcentaje > 2.1:
porcentaje = 2.1
elif porcentaje <= 1.2 and porcentaje >= 0.8:
porcentaje = 1.0
elif porcentaje <= 0.79:
porcentaje = 0.8
# Smoth and round
porcentaje = round(porcentaje+0.0, 1)
# apply aceleration or opposite to the audio file in audio2 folder
os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")
duration_create = librosa.get_duration(filename=f"audio2/{filename}")
audio_files.append(filename)
# replace files with the accelerates
os.system("mv -f audio2/audio/*.ogg audio/")
os.system(f"rm {Output_name_file}")
create_translated_audio(result_diarize, audio_files, Output_name_file)
os.system("rm audio_dub_stereo.wav")
os.system("ffmpeg -i audio_dub_solo.wav -ac 1 audio_dub_stereo.wav")
#os.system(f"ffmpeg -i Video.mp4 -i {Output_name_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
os.system(f"rm {mix_audio}")
#os.system(f'''ffmpeg -i {audio_wav} -i audio_dub_stereo.wav -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}''')
#os.system(f'ffmpeg -y -i {audio_wav} -i audio_dub_stereo.wav -filter_complex "[0:0][1:0] amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
os.system(f'ffmpeg -y -i audio.wav -i audio_dub_stereo.wav -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
os.system(f"rm {video_output}")
os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
return video_output
import sys
class Logger:
def __init__(self, filename):
self.terminal = sys.stdout
self.log = open(filename, "w")
def write(self, message):
self.terminal.write(message)
self.log.write(message)
def flush(self):
self.terminal.flush()
self.log.flush()
def isatty(self):
return False
sys.stdout = Logger("output.log")
def read_logs():
sys.stdout.flush()
with open("output.log", "r") as f:
return f.read()
with gr.Blocks() as demo:
gr.Markdown(title)
gr.Markdown(description)
gr.Markdown(tutorial)
with gr.Tab("Translate audio from video"):
with gr.Row():
with gr.Column():
video_input = gr.Video() # height=300,width=300
gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
TRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en',label = 'Translate audio to')
gr.Markdown("Select how many people are speaking in the video.")
min_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
max_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers",step=1)
gr.Markdown("Select the voice you want for each speaker.")
tts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1')
tts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2')
tts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3')
tts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4')
tts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5')
tts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6')
gr.Markdown("Default configuration of Whisper.")
WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
with gr.Column(variant='compact'):
with gr.Row():
video_button = gr.Button("Translate audio of video", )
with gr.Row():
video_output = gr.Video()
gr.Examples(
examples=[
[
"./assets/Video_subtitled.mp4",
"base",
16,
"float32",
"en",
1,
2,
'en-AU-WilliamNeural-Male',
'en-CA-ClaraNeural-Female',
'en-GB-ThomasNeural-Male',
'en-GB-SoniaNeural-Female',
'en-NZ-MitchellNeural-Male',
'en-GB-MaisieNeural-Female',
],
],
fn=translate_from_video,
inputs=[
video_input,
WHISPER_MODEL_SIZE,
batch_size,
compute_type,
TRANSLATE_AUDIO_TO,
min_speakers,
max_speakers,
tts_voice00,
tts_voice01,
tts_voice02,
tts_voice03,
tts_voice04,
tts_voice05,
],
outputs=[video_output],
#cache_examples=True,
)
with gr.Tab("Translate audio from video link"):
with gr.Row():
with gr.Column():
link_input = gr.Textbox(label="Media link. Example: www.youtube.com/watch?v=g_9rPvbENUw", placeholder="URL goes here...")
#filename = gr.Textbox(label="File name", placeholder="best-vid")
gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
bTRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en',label = 'Translate audio to')
gr.Markdown("Select how many people are speaking in the video.")
bmin_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
bmax_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers",step=1)
gr.Markdown("Select the voice you want for each speaker.")
btts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1')
btts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2')
btts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3')
btts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4')
btts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5')
btts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6')
gr.Markdown("Default configuration of Whisper.")
bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
# text_button = gr.Button("Translate audio of video")
# link_output = gr.Video() #gr.outputs.File(label="Download!")
with gr.Column(variant='compact'):
with gr.Row():
text_button = gr.Button("Translate audio of video")
with gr.Row():
link_output = gr.Video() #gr.outputs.File(label="Download!") # gr.Video()
gr.Examples(
examples=[
[
"https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
"base",
16,
"float32",
"en",
1,
2,
'en-CA-ClaraNeural-Female',
'en-AU-WilliamNeural-Male',
'en-GB-ThomasNeural-Male',
'en-GB-SoniaNeural-Female',
'en-NZ-MitchellNeural-Male',
'en-GB-MaisieNeural-Female',
],
],
fn=translate_from_video,
inputs=[
link_input,
bWHISPER_MODEL_SIZE,
bbatch_size,
bcompute_type,
bTRANSLATE_AUDIO_TO,
bmin_speakers,
bmax_speakers,
btts_voice00,
btts_voice01,
btts_voice02,
btts_voice03,
btts_voice04,
btts_voice05,
],
outputs=[video_output],
#cache_examples=True,
)
with gr.Accordion("Logs"):
logs = gr.Textbox()
demo.load(read_logs, None, logs, every=1)
# run
video_button.click(translate_from_video, inputs=[
video_input,
WHISPER_MODEL_SIZE,
batch_size,
compute_type,
TRANSLATE_AUDIO_TO,
min_speakers,
max_speakers,
tts_voice00,
tts_voice01,
tts_voice02,
tts_voice03,
tts_voice04,
tts_voice05,], outputs=video_output)
text_button.click(translate_from_video, inputs=[
link_input,
bWHISPER_MODEL_SIZE,
bbatch_size,
bcompute_type,
bTRANSLATE_AUDIO_TO,
bmin_speakers,
bmax_speakers,
btts_voice00,
btts_voice01,
btts_voice02,
btts_voice03,
btts_voice04,
btts_voice05,], outputs=link_output)
demo.launch(enable_queue=True)