Spaces:
Running
Running
add new ui and options for demo
Browse files
app.py
CHANGED
@@ -1,6 +1,4 @@
|
|
1 |
-
|
2 |
-
#os.system("git clone https://github.com/R3gm/SoniTranslate")
|
3 |
-
# pip install -r requirements.txt
|
4 |
import numpy as np
|
5 |
import gradio as gr
|
6 |
import whisperx
|
@@ -17,45 +15,41 @@ import os
|
|
17 |
from soni_translate.audio_segments import create_translated_audio
|
18 |
from soni_translate.text_to_speech import make_voice_gradio
|
19 |
from soni_translate.translate_segments import translate_text
|
20 |
-
#from soni_translate import test
|
21 |
|
22 |
title = "<center><strong><font size='7'>๐ฝ๏ธ SoniTranslate ๐ท๏ธ</font></strong></center>"
|
23 |
|
24 |
news = """ ## ๐ News
|
25 |
-
๐ฅ 2023/07/
|
26 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
description = """ ## Translate the audio of a video content from one language to another while preserving synchronization.
|
29 |
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
๐ผ You can upload a video or provide a video link. The generation is **limited to 10 seconds** to prevent errors with the queue in cpu. If you use a GPU, you won't have any of these limitations.
|
34 |
-
|
35 |
-
๐ For **translate a video of any duration** and faster results, you can use the Colab notebook with GPU.
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
"""
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
1. Upload a video on the first tab or use a video link on the second tab.
|
44 |
-
|
45 |
-
2. Choose the language in which you want to translate the video.
|
46 |
-
|
47 |
-
3. Specify the number of people speaking in the video and assign each one a text-to-speech voice suitable for the translation language.
|
48 |
-
|
49 |
-
4. Press the 'Translate' button to obtain the results.
|
50 |
-
|
51 |
-
"""
|
52 |
|
|
|
|
|
|
|
|
|
53 |
|
54 |
-
if not os.path.exists('audio'):
|
55 |
-
os.makedirs('audio')
|
56 |
|
57 |
-
if not os.path.exists('audio2/audio'):
|
58 |
-
os.makedirs('audio2/audio')
|
59 |
|
60 |
# Check GPU
|
61 |
if torch.cuda.is_available():
|
@@ -70,74 +64,151 @@ else:
|
|
70 |
whisper_model_default = 'base'
|
71 |
print('Working in: ', device)
|
72 |
|
|
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
### INIT
|
78 |
-
list_tts = ['af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male', 'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female', 'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male', 'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female', 'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male', 'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male', 'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female', 'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male', 'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female', 'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male', 'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male', 'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female', 'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female', 'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male', 'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female', 'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male', 'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female', 'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male', 'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female', 'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female', 'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male', 'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female', 'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female', 'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female', 'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female', 'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female', 'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male', 'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male', 'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female', 'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male', 'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male', 'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male', 'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 
'en-GB-ThomasNeural-Male', 'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female', 'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female', 'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male', 'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male', 'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female', 'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female', 'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female', 'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male', 'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female', 'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male', 'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male', 'en-US-SteffanNeural-Male', 'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male', 'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male', 'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female', 'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male', 'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female', 'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female', 'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male', 'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female', 'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male', 'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female', 'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female', 'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female', 'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female', 'es-MX-DaliaNeural-Female', 'es-MX-JorgeNeural-Male', 'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female', 'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male', 'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female', 'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male', 'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female', 'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male', 'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female', 'es-UY-MateoNeural-Male', 
'es-UY-ValentinaNeural-Female', 'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male', 'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male', 'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male', 'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female', 'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female', 'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male', 'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female', 'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male', 'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male', 'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female', 'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female', 'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male', 'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female', 'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female', 'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male', 'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male', 'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female', 'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male', 'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female', 'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female', 'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female', 'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male', 'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male', 'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female', 'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female', 'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female', 'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female', 'lt-LT-LeonasNeural-Male', 'lt-LT-OnaNeural-Female', 'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male', 'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female', 'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female', 'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female', 'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male', 'ms-MY-OsmanNeural-Male', 
'ms-MY-YasminNeural-Female', 'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male', 'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male', 'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female', 'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male', 'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female', 'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male', 'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female', 'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female', 'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female', 'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female', 'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male', 'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female', 'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female', 'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female', 'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male', 'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female', 'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male', 'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female', 'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female', 'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female', 'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female', 'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female', 'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male', 'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female', 'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male', 'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female', 'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female', 'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female', 'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female', 'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female', 'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male', 'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female', 'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male', 'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male', 'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 
'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male', 'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male', 'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female']
|
79 |
-
|
80 |
-
|
81 |
-
def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
82 |
TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
|
83 |
tts_voice00, tts_voice01,tts_voice02,tts_voice03,tts_voice04,tts_voice05):
|
84 |
|
85 |
YOUR_HF_TOKEN = os.getenv("My_hf_token")
|
86 |
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
OutputFile = 'Video.mp4'
|
89 |
audio_wav = "audio.wav"
|
90 |
-
Output_name_file = "audio_dub_solo.
|
91 |
mix_audio = "audio_mix.mp3"
|
92 |
-
|
93 |
-
|
94 |
-
os.system(f"rm {Output_name_file}")
|
95 |
os.system("rm Video.mp4")
|
96 |
-
|
97 |
os.system("rm audio.wav")
|
98 |
-
|
99 |
|
100 |
if os.path.exists(video):
|
101 |
-
|
102 |
-
|
103 |
-
# max 1 minute in cpu
|
104 |
-
print('10 s. Limited for CPU ')
|
105 |
os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
|
106 |
else:
|
107 |
os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')
|
108 |
-
|
109 |
os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
|
110 |
else:
|
111 |
-
|
112 |
-
|
113 |
-
# max 1 minute in cpu
|
114 |
-
print('10 s. Limited for CPU ')
|
115 |
#https://github.com/yt-dlp/yt-dlp/issues/2220
|
116 |
mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
|
117 |
-
wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac
|
|
|
|
|
118 |
else:
|
119 |
mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
|
120 |
wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
print("Set file complete.")
|
126 |
-
|
|
|
|
|
127 |
# 1. Transcribe with original whisper (batched)
|
128 |
model = whisperx.load_model(
|
129 |
WHISPER_MODEL_SIZE,
|
130 |
device,
|
131 |
-
compute_type=compute_type
|
|
|
132 |
)
|
133 |
audio = whisperx.load_audio(audio_wav)
|
134 |
result = model.transcribe(audio, batch_size=batch_size)
|
135 |
gc.collect(); torch.cuda.empty_cache(); del model
|
136 |
print("Transcript complete")
|
137 |
-
|
138 |
# 2. Align whisper output
|
139 |
model_a, metadata = whisperx.load_align_model(
|
140 |
-
language_code=result["language"],
|
141 |
device=device
|
142 |
)
|
143 |
result = whisperx.align(
|
@@ -150,7 +221,11 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
|
150 |
)
|
151 |
gc.collect(); torch.cuda.empty_cache(); del model_a
|
152 |
print("Align complete")
|
153 |
-
|
|
|
|
|
|
|
|
|
154 |
# 3. Assign speaker labels
|
155 |
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
|
156 |
diarize_segments = diarize_model(
|
@@ -160,10 +235,10 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
|
160 |
result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
|
161 |
gc.collect(); torch.cuda.empty_cache(); del diarize_model
|
162 |
print("Diarize complete")
|
163 |
-
|
164 |
result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
|
165 |
print("Translation complete")
|
166 |
-
|
167 |
audio_files = []
|
168 |
|
169 |
# Mapping speakers to voice variables
|
@@ -176,7 +251,7 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
|
176 |
'SPEAKER_05': tts_voice05
|
177 |
}
|
178 |
|
179 |
-
for segment in result_diarize['segments']:
|
180 |
|
181 |
text = segment['text']
|
182 |
start = segment['start']
|
@@ -193,7 +268,7 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
|
193 |
filename = f"audio/{start}.ogg"
|
194 |
|
195 |
if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
|
196 |
-
make_voice_gradio(text, speaker_to_voice[speaker], filename)
|
197 |
elif speaker == "SPEAKER_99":
|
198 |
try:
|
199 |
tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
|
@@ -202,7 +277,7 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
|
202 |
except:
|
203 |
tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
|
204 |
tts.save(filename)
|
205 |
-
print('
|
206 |
|
207 |
# duration
|
208 |
duration_true = end - start
|
@@ -212,7 +287,7 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
|
212 |
porcentaje = duration_tts / duration_true
|
213 |
|
214 |
if porcentaje > 2.1:
|
215 |
-
porcentaje = 2.1
|
216 |
elif porcentaje <= 1.2 and porcentaje >= 0.8:
|
217 |
porcentaje = 1.0
|
218 |
elif porcentaje <= 0.79:
|
@@ -231,25 +306,26 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
|
231 |
os.system("mv -f audio2/audio/*.ogg audio/")
|
232 |
|
233 |
os.system(f"rm {Output_name_file}")
|
234 |
-
|
235 |
create_translated_audio(result_diarize, audio_files, Output_name_file)
|
236 |
|
237 |
-
os.system("rm audio_dub_stereo.wav")
|
238 |
-
os.system("ffmpeg -i audio_dub_solo.wav -ac 1 audio_dub_stereo.wav")
|
239 |
-
|
240 |
-
#os.system(f"ffmpeg -i Video.mp4 -i {Output_name_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
|
241 |
-
|
242 |
os.system(f"rm {mix_audio}")
|
243 |
-
|
244 |
-
#
|
245 |
-
|
246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
os.system(f"rm {video_output}")
|
248 |
os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
|
249 |
-
|
250 |
-
return video_output
|
251 |
-
|
252 |
|
|
|
253 |
|
254 |
import sys
|
255 |
|
@@ -276,52 +352,79 @@ def read_logs():
|
|
276 |
with open("output.log", "r") as f:
|
277 |
return f.read()
|
278 |
|
|
|
|
|
279 |
|
280 |
-
|
|
|
|
|
281 |
gr.Markdown(title)
|
282 |
gr.Markdown(description)
|
283 |
-
gr.Markdown(tutorial)
|
284 |
|
|
|
285 |
with gr.Tab("Translate audio from video"):
|
286 |
with gr.Row():
|
287 |
with gr.Column():
|
288 |
video_input = gr.Video() # height=300,width=300
|
289 |
-
|
290 |
-
gr.
|
291 |
-
|
292 |
-
|
293 |
gr.Markdown("Select how many people are speaking in the video.")
|
294 |
-
min_speakers = gr.
|
295 |
-
max_speakers = gr.
|
296 |
-
|
297 |
gr.Markdown("Select the voice you want for each speaker.")
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
gr.
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
with gr.Row():
|
312 |
video_button = gr.Button("TRANSLATE", )
|
313 |
with gr.Row():
|
314 |
video_output = gr.Video()
|
315 |
|
|
|
|
|
|
|
|
|
|
|
316 |
|
317 |
gr.Examples(
|
318 |
examples=[
|
319 |
[
|
320 |
-
"./assets/
|
|
|
|
|
321 |
"base",
|
322 |
16,
|
323 |
"float32",
|
324 |
-
"
|
|
|
325 |
1,
|
326 |
2,
|
327 |
'en-AU-WilliamNeural-Male',
|
@@ -330,15 +433,20 @@ with gr.Blocks() as demo:
|
|
330 |
'en-GB-SoniaNeural-Female',
|
331 |
'en-NZ-MitchellNeural-Male',
|
332 |
'en-GB-MaisieNeural-Female',
|
|
|
|
|
333 |
],
|
334 |
],
|
335 |
fn=translate_from_video,
|
336 |
inputs=[
|
337 |
video_input,
|
338 |
-
|
|
|
|
|
339 |
batch_size,
|
340 |
-
compute_type,
|
341 |
-
|
|
|
342 |
min_speakers,
|
343 |
max_speakers,
|
344 |
tts_voice00,
|
@@ -347,58 +455,119 @@ with gr.Blocks() as demo:
|
|
347 |
tts_voice03,
|
348 |
tts_voice04,
|
349 |
tts_voice05,
|
|
|
|
|
350 |
],
|
351 |
outputs=[video_output],
|
352 |
-
cache_examples=
|
353 |
)
|
354 |
|
|
|
355 |
|
356 |
with gr.Tab("Translate audio from video link"):
|
357 |
with gr.Row():
|
358 |
with gr.Column():
|
359 |
-
|
360 |
-
|
361 |
-
#
|
362 |
-
|
363 |
-
gr.
|
364 |
-
|
365 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
366 |
gr.Markdown("Select how many people are speaking in the video.")
|
367 |
-
bmin_speakers = gr.
|
368 |
-
bmax_speakers = gr.
|
369 |
-
|
370 |
gr.Markdown("Select the voice you want for each speaker.")
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
gr.
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
383 |
# text_button = gr.Button("Translate audio of video")
|
384 |
# link_output = gr.Video() #gr.outputs.File(label="Download!")
|
385 |
|
386 |
|
387 |
|
388 |
-
with gr.Column(variant='compact'):
|
389 |
with gr.Row():
|
390 |
text_button = gr.Button("TRANSLATE")
|
391 |
with gr.Row():
|
392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
|
394 |
gr.Examples(
|
395 |
examples=[
|
396 |
[
|
397 |
"https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
|
|
|
|
|
398 |
"base",
|
399 |
16,
|
400 |
"float32",
|
401 |
-
"
|
|
|
402 |
1,
|
403 |
2,
|
404 |
'en-CA-ClaraNeural-Female',
|
@@ -407,15 +576,20 @@ with gr.Blocks() as demo:
|
|
407 |
'en-GB-SoniaNeural-Female',
|
408 |
'en-NZ-MitchellNeural-Male',
|
409 |
'en-GB-MaisieNeural-Female',
|
|
|
|
|
410 |
],
|
411 |
],
|
412 |
fn=translate_from_video,
|
413 |
inputs=[
|
414 |
-
|
415 |
-
|
|
|
|
|
416 |
bbatch_size,
|
417 |
-
bcompute_type,
|
418 |
-
|
|
|
419 |
bmin_speakers,
|
420 |
bmax_speakers,
|
421 |
btts_voice00,
|
@@ -424,24 +598,34 @@ with gr.Blocks() as demo:
|
|
424 |
btts_voice03,
|
425 |
btts_voice04,
|
426 |
btts_voice05,
|
|
|
|
|
427 |
],
|
428 |
-
outputs=[
|
429 |
-
cache_examples=
|
430 |
)
|
431 |
|
432 |
|
433 |
-
|
434 |
-
|
|
|
|
|
|
|
|
|
|
|
435 |
logs = gr.Textbox()
|
436 |
demo.load(read_logs, None, logs, every=1)
|
437 |
|
438 |
# run
|
439 |
video_button.click(translate_from_video, inputs=[
|
440 |
-
video_input,
|
441 |
-
|
|
|
|
|
442 |
batch_size,
|
443 |
-
compute_type,
|
444 |
-
|
|
|
445 |
min_speakers,
|
446 |
max_speakers,
|
447 |
tts_voice00,
|
@@ -449,13 +633,19 @@ with gr.Blocks() as demo:
|
|
449 |
tts_voice02,
|
450 |
tts_voice03,
|
451 |
tts_voice04,
|
452 |
-
tts_voice05,
|
|
|
|
|
|
|
453 |
text_button.click(translate_from_video, inputs=[
|
454 |
-
|
455 |
-
|
|
|
|
|
456 |
bbatch_size,
|
457 |
-
bcompute_type,
|
458 |
-
|
|
|
459 |
bmin_speakers,
|
460 |
bmax_speakers,
|
461 |
btts_voice00,
|
@@ -463,11 +653,10 @@ with gr.Blocks() as demo:
|
|
463 |
btts_voice02,
|
464 |
btts_voice03,
|
465 |
btts_voice04,
|
466 |
-
btts_voice05,
|
467 |
-
|
|
|
|
|
468 |
|
469 |
demo.launch(enable_queue=True)
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
|
|
1 |
+
#%cd SoniTranslate
|
|
|
|
|
2 |
import numpy as np
|
3 |
import gradio as gr
|
4 |
import whisperx
|
|
|
15 |
from soni_translate.audio_segments import create_translated_audio
|
16 |
from soni_translate.text_to_speech import make_voice_gradio
|
17 |
from soni_translate.translate_segments import translate_text
|
|
|
18 |
|
19 |
title = "<center><strong><font size='7'>๐ฝ๏ธ SoniTranslate ๐ท๏ธ</font></strong></center>"
|
20 |
|
21 |
news = """ ## ๐ News
|
22 |
+
🔥 2023/07/26: New UI and mix options added.
|
23 |
+
"""
|
24 |
+
|
25 |
+
description = """
|
26 |
+
### 🎥 **Translate videos easily with SoniTranslate!** 📽️
|
27 |
+
|
28 |
+
Upload a video or provide a video link. Limitation: 10 seconds for CPU, but no restrictions with a GPU.
|
29 |
+
|
30 |
+
For faster results and no duration limits, try the Colab notebook with a GPU:
|
31 |
+
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)
|
32 |
+
|
33 |
+
📽️ **This is a demo of SoniTranslate; GitHub repository: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!**
|
34 |
+
|
35 |
+
See the tab labeled 'Help' for instructions on how to use it. Let's start having fun with video translation! ๐๐
|
36 |
+
"""
|
37 |
|
|
|
38 |
|
39 |
|
40 |
+
tutorial = """
|
41 |
+
## ๐ฐ **Instructions for use:**
|
|
|
|
|
|
|
42 |
|
43 |
+
1. ๐ค **Upload a video** on the first tab or ๐ **use a video link** on the second tab.
|
|
|
|
|
44 |
|
45 |
+
2. ๐ Choose the language in which you want to **translate the video**.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
+
3. 🗣️ Specify the **number of people speaking** in the video and **assign each one a text-to-speech voice** suitable for the translation language.
|
48 |
+
|
49 |
+
4. ๐ Press the '**Translate**' button to obtain the results.
|
50 |
+
"""
|
51 |
|
|
|
|
|
52 |
|
|
|
|
|
53 |
|
54 |
# Check GPU
|
55 |
if torch.cuda.is_available():
|
|
|
64 |
whisper_model_default = 'base'
|
65 |
print('Working in: ', device)
|
66 |
|
67 |
+
list_tts = ['af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male', 'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female', 'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male', 'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female', 'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male', 'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male', 'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female', 'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male', 'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female', 'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male', 'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male', 'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female', 'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female', 'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male', 'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female', 'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male', 'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female', 'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male', 'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female', 'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female', 'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male', 'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female', 'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female', 'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female', 'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female', 'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female', 'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male', 'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male', 'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female', 'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male', 'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male', 'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male', 'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 
'en-GB-ThomasNeural-Male', 'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female', 'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female', 'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male', 'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male', 'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female', 'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female', 'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female', 'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male', 'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female', 'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male', 'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male', 'en-US-SteffanNeural-Male', 'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male', 'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male', 'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female', 'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male', 'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female', 'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female', 'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male', 'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female', 'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male', 'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female', 'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female', 'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female', 'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female', 'es-MX-DaliaNeural-Female', 'es-MX-JorgeNeural-Male', 'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female', 'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male', 'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female', 'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male', 'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female', 'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male', 'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female', 'es-UY-MateoNeural-Male', 
'es-UY-ValentinaNeural-Female', 'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male', 'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male', 'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male', 'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female', 'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female', 'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male', 'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female', 'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male', 'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male', 'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female', 'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female', 'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male', 'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female', 'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female', 'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male', 'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male', 'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female', 'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male', 'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female', 'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female', 'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female', 'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male', 'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male', 'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female', 'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female', 'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female', 'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female', 'lt-LT-LeonasNeural-Male', 'lt-LT-OnaNeural-Female', 'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male', 'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female', 'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female', 'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female', 'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male', 'ms-MY-OsmanNeural-Male', 
'ms-MY-YasminNeural-Female', 'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male', 'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male', 'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female', 'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male', 'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female', 'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male', 'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female', 'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female', 'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female', 'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female', 'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male', 'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female', 'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female', 'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female', 'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male', 'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female', 'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male', 'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female', 'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female', 'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female', 'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female', 'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female', 'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male', 'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female', 'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male', 'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female', 'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female', 'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female', 'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female', 'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female', 'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male', 'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female', 'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male', 'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male', 'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 
'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male', 'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male', 'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female']
|
68 |
|
69 |
+
'''
|
70 |
+
def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
|
72 |
tts_voice00, tts_voice01,tts_voice02,tts_voice03,tts_voice04,tts_voice05):
|
73 |
|
74 |
YOUR_HF_TOKEN = os.getenv("My_hf_token")
|
75 |
|
76 |
+
create_translated_audio(result_diarize, audio_files, Output_name_file)
|
77 |
+
|
78 |
+
os.system("rm audio_dub_stereo.wav")
|
79 |
+
os.system("ffmpeg -i audio_dub_solo.wav -ac 1 audio_dub_stereo.wav")
|
80 |
+
|
81 |
+
os.system(f"rm {mix_audio}")
|
82 |
+
os.system(f'ffmpeg -y -i audio.wav -i audio_dub_stereo.wav -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
|
83 |
+
|
84 |
+
os.system(f"rm {video_output}")
|
85 |
+
os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
|
86 |
+
|
87 |
+
return video_output
|
88 |
+
'''
|
89 |
+
|
90 |
+
def translate_from_video(
|
91 |
+
video,
|
92 |
+
YOUR_HF_TOKEN,
|
93 |
+
preview=False,
|
94 |
+
WHISPER_MODEL_SIZE="large-v1",
|
95 |
+
batch_size=16,
|
96 |
+
compute_type="float16",
|
97 |
+
SOURCE_LANGUAGE= "Automatic detection",
|
98 |
+
TRANSLATE_AUDIO_TO="English (en)",
|
99 |
+
min_speakers=1,
|
100 |
+
max_speakers=2,
|
101 |
+
tts_voice00="en-AU-WilliamNeural-Male",
|
102 |
+
tts_voice01="en-CA-ClaraNeural-Female",
|
103 |
+
tts_voice02="en-GB-ThomasNeural-Male",
|
104 |
+
tts_voice03="en-GB-SoniaNeural-Female",
|
105 |
+
tts_voice04="en-NZ-MitchellNeural-Male",
|
106 |
+
tts_voice05="en-GB-MaisieNeural-Female",
|
107 |
+
video_output="video_dub.mp4",
|
108 |
+
AUDIO_MIX_METHOD='Adjusting volumes and mixing audio',
|
109 |
+
):
|
110 |
+
|
111 |
+
if YOUR_HF_TOKEN == "" or YOUR_HF_TOKEN == None:
|
112 |
+
YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
|
113 |
+
if YOUR_HF_TOKEN == None:
|
114 |
+
print('No valid token')
|
115 |
+
return
|
116 |
+
|
117 |
+
if "SET_LIMIT" == os.getenv("DEMO"):
|
118 |
+
preview=True
|
119 |
+
print("DEMO; set preview=True; The generation is **limited to 10 seconds** to prevent errors with the CPU. If you use a GPU, you won't have any of these limitations.")
|
120 |
+
AUDIO_MIX_METHOD='Adjusting volumes and mixing audio'
|
121 |
+
print("DEMO; set Adjusting volumes and mixing audio")
|
122 |
+
|
123 |
+
LANGUAGES = {
|
124 |
+
'Automatic detection': 'Automatic detection',
|
125 |
+
'English (en)': 'en',
|
126 |
+
'French (fr)': 'fr',
|
127 |
+
'German (de)': 'de',
|
128 |
+
'Spanish (es)': 'es',
|
129 |
+
'Italian (it)': 'it',
|
130 |
+
'Japanese (ja)': 'ja',
|
131 |
+
'Chinese (zh)': 'zh',
|
132 |
+
'Dutch (nl)': 'nl',
|
133 |
+
'Ukrainian (uk)': 'uk',
|
134 |
+
'Portuguese (pt)': 'pt'
|
135 |
+
}
|
136 |
+
|
137 |
+
TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO]
|
138 |
+
SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE]
|
139 |
+
|
140 |
+
|
141 |
+
if not os.path.exists('audio'):
|
142 |
+
os.makedirs('audio')
|
143 |
+
|
144 |
+
if not os.path.exists('audio2/audio'):
|
145 |
+
os.makedirs('audio2/audio')
|
146 |
+
|
147 |
+
# Check GPU
|
148 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
149 |
+
compute_type = "float32" if device == "cpu" else compute_type
|
150 |
+
|
151 |
OutputFile = 'Video.mp4'
|
152 |
audio_wav = "audio.wav"
|
153 |
+
Output_name_file = "audio_dub_solo.ogg"
|
154 |
mix_audio = "audio_mix.mp3"
|
155 |
+
|
|
|
|
|
156 |
os.system("rm Video.mp4")
|
157 |
+
os.system("rm audio.webm")
|
158 |
os.system("rm audio.wav")
|
|
|
159 |
|
160 |
if os.path.exists(video):
|
161 |
+
if preview:
|
162 |
+
print('Creating a preview video of 10 seconds, to disable this option, go to advanced settings and turn off preview.')
|
|
|
|
|
163 |
os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
|
164 |
else:
|
165 |
os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')
|
166 |
+
|
167 |
os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
|
168 |
else:
|
169 |
+
if preview:
|
170 |
+
print('Creating a preview from the link, 10 seconds to disable this option, go to advanced settings and turn off preview.')
|
|
|
|
|
171 |
#https://github.com/yt-dlp/yt-dlp/issues/2220
|
172 |
mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
|
173 |
+
wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
|
174 |
+
os.system(mp4_)
|
175 |
+
os.system(wav_)
|
176 |
else:
|
177 |
mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
|
178 |
wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
|
179 |
+
|
180 |
+
os.system(wav_)
|
181 |
+
|
182 |
+
for i in range (120):
|
183 |
+
time.sleep(1)
|
184 |
+
print('process audio...')
|
185 |
+
if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
|
186 |
+
time.sleep(1)
|
187 |
+
os.system(mp4_)
|
188 |
+
break
|
189 |
+
if i == 119:
|
190 |
+
print('Error donwloading the audio')
|
191 |
+
return
|
192 |
|
193 |
print("Set file complete.")
|
194 |
+
|
195 |
+
SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE
|
196 |
+
|
197 |
# 1. Transcribe with original whisper (batched)
|
198 |
model = whisperx.load_model(
|
199 |
WHISPER_MODEL_SIZE,
|
200 |
device,
|
201 |
+
compute_type=compute_type,
|
202 |
+
language= SOURCE_LANGUAGE,
|
203 |
)
|
204 |
audio = whisperx.load_audio(audio_wav)
|
205 |
result = model.transcribe(audio, batch_size=batch_size)
|
206 |
gc.collect(); torch.cuda.empty_cache(); del model
|
207 |
print("Transcript complete")
|
208 |
+
|
209 |
# 2. Align whisper output
|
210 |
model_a, metadata = whisperx.load_align_model(
|
211 |
+
language_code=result["language"],
|
212 |
device=device
|
213 |
)
|
214 |
result = whisperx.align(
|
|
|
221 |
)
|
222 |
gc.collect(); torch.cuda.empty_cache(); del model_a
|
223 |
print("Align complete")
|
224 |
+
|
225 |
+
if result['segments'] == []:
|
226 |
+
print('No active speech found in audio')
|
227 |
+
return
|
228 |
+
|
229 |
# 3. Assign speaker labels
|
230 |
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
|
231 |
diarize_segments = diarize_model(
|
|
|
235 |
result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
|
236 |
gc.collect(); torch.cuda.empty_cache(); del diarize_model
|
237 |
print("Diarize complete")
|
238 |
+
|
239 |
result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
|
240 |
print("Translation complete")
|
241 |
+
|
242 |
audio_files = []
|
243 |
|
244 |
# Mapping speakers to voice variables
|
|
|
251 |
'SPEAKER_05': tts_voice05
|
252 |
}
|
253 |
|
254 |
+
for segment in tqdm(result_diarize['segments']):
|
255 |
|
256 |
text = segment['text']
|
257 |
start = segment['start']
|
|
|
268 |
filename = f"audio/{start}.ogg"
|
269 |
|
270 |
if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
|
271 |
+
make_voice_gradio(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
|
272 |
elif speaker == "SPEAKER_99":
|
273 |
try:
|
274 |
tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
|
|
|
277 |
except:
|
278 |
tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
|
279 |
tts.save(filename)
|
280 |
+
print('Error: Audio will be replaced.')
|
281 |
|
282 |
# duration
|
283 |
duration_true = end - start
|
|
|
287 |
porcentaje = duration_tts / duration_true
|
288 |
|
289 |
if porcentaje > 2.1:
|
290 |
+
porcentaje = 2.1
|
291 |
elif porcentaje <= 1.2 and porcentaje >= 0.8:
|
292 |
porcentaje = 1.0
|
293 |
elif porcentaje <= 0.79:
|
|
|
306 |
os.system("mv -f audio2/audio/*.ogg audio/")
|
307 |
|
308 |
os.system(f"rm {Output_name_file}")
|
|
|
309 |
create_translated_audio(result_diarize, audio_files, Output_name_file)
|
310 |
|
|
|
|
|
|
|
|
|
|
|
311 |
os.system(f"rm {mix_audio}")
|
312 |
+
|
313 |
+
# TYPE MIX AUDIO
|
314 |
+
if AUDIO_MIX_METHOD == 'Adjusting volumes and mixing audio':
|
315 |
+
# volume mix
|
316 |
+
os.system(f'ffmpeg -y -i {audio_wav} -i {Output_name_file} -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
|
317 |
+
else:
|
318 |
+
try:
|
319 |
+
# background mix
|
320 |
+
os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}')
|
321 |
+
except:
|
322 |
+
# volume mix except
|
323 |
+
os.system(f'ffmpeg -y -i {audio_wav} -i {Output_name_file} -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
|
324 |
+
|
325 |
os.system(f"rm {video_output}")
|
326 |
os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
|
|
|
|
|
|
|
327 |
|
328 |
+
return video_output
|
329 |
|
330 |
import sys
|
331 |
|
|
|
352 |
with open("output.log", "r") as f:
|
353 |
return f.read()
|
354 |
|
355 |
+
# max tts
|
356 |
+
MAX_TTS = 6
|
357 |
|
358 |
+
theme='Taithrah/Minimal'
|
359 |
+
|
360 |
+
with gr.Blocks(theme=theme) as demo:
|
361 |
gr.Markdown(title)
|
362 |
gr.Markdown(description)
|
|
|
363 |
|
364 |
+
#### video
|
365 |
with gr.Tab("Translate audio from video"):
|
366 |
with gr.Row():
|
367 |
with gr.Column():
|
368 |
video_input = gr.Video() # height=300,width=300
|
369 |
+
SOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'English (en)', 'French (fr)', 'German (de)', 'Spanish (es)', 'Italian (it)', 'Japanese (ja)', 'Chinese (zh)', 'Dutch (nl)', 'Ukrainian (uk)', 'Portuguese (pt)'], value='Automatic detection',label = 'Source language', info="This is the original language of the video")
|
370 |
+
TRANSLATE_AUDIO_TO = gr.Dropdown(['English (en)', 'French (fr)', 'German (de)', 'Spanish (es)', 'Italian (it)', 'Japanese (ja)', 'Chinese (zh)', 'Dutch (nl)', 'Ukrainian (uk)', 'Portuguese (pt)'], value='English (en)',label = 'Translate audio to', info="Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
|
371 |
+
|
372 |
+
line_ = gr.HTML("<hr></h2>")
|
373 |
gr.Markdown("Select how many people are speaking in the video.")
|
374 |
+
min_speakers = gr.Slider(1, MAX_TTS, default=1, label="min_speakers", step=1, visible=False)
|
375 |
+
max_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interative=True)
|
|
|
376 |
gr.Markdown("Select the voice you want for each speaker.")
|
377 |
+
def submit(value):
    """Return visibility updates for the six TTS speaker dropdowns.

    The dropdown at 0-based position ``slot`` is made visible when
    ``slot < value``; the remaining ones are hidden.
    """
    # One update per dropdown, in the same order as the outputs list
    # passed to max_speakers.change.
    return [gr.update(visible=(slot < value)) for slot in range(6)]
|
382 |
+
tts_voice00 = gr.Dropdown(list_tts, value='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1', visible=True, interactive= True)
|
383 |
+
tts_voice01 = gr.Dropdown(list_tts, value='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2', visible=True, interactive= True)
|
384 |
+
tts_voice02 = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3', visible=False, interactive= True)
|
385 |
+
tts_voice03 = gr.Dropdown(list_tts, value='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4', visible=False, interactive= True)
|
386 |
+
tts_voice04 = gr.Dropdown(list_tts, value='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5', visible=False, interactive= True)
|
387 |
+
tts_voice05 = gr.Dropdown(list_tts, value='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6', visible=False, interactive= True)
|
388 |
+
max_speakers.change(submit, max_speakers, [tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, tts_voice05])
|
389 |
+
|
390 |
+
with gr.Column():
|
391 |
+
with gr.Accordion("Advanced Settings", open=False):
|
392 |
+
|
393 |
+
AUDIO_MIX = gr.Dropdown(['Mixing audio with sidechain compression', 'Adjusting volumes and mixing audio'], value='Adjusting volumes and mixing audio', label = 'Audio Mixing Method', info="Mix original and translated audio files to create a customized, balanced output with two available mixing modes.")
|
394 |
+
|
395 |
+
gr.HTML("<hr></h2>")
|
396 |
+
gr.Markdown("Default configuration of Whisper.")
|
397 |
+
WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
|
398 |
+
batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
|
399 |
+
compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
|
400 |
+
|
401 |
+
gr.HTML("<hr></h2>")
|
402 |
+
VIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name" ,value="video_output.mp4", info="The name of the output file")
|
403 |
+
PREVIEW = gr.Checkbox(label="Preview", info="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
|
404 |
+
|
405 |
+
with gr.Column(variant='compact'):
|
406 |
with gr.Row():
|
407 |
video_button = gr.Button("TRANSLATE", )
|
408 |
with gr.Row():
|
409 |
video_output = gr.Video()
|
410 |
|
411 |
+
line_ = gr.HTML("<hr></h2>")
|
412 |
+
if os.getenv("YOUR_HF_TOKEN") == None or os.getenv("YOUR_HF_TOKEN") == "":
|
413 |
+
HFKEY = gr.Textbox(visible= True, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
|
414 |
+
else:
|
415 |
+
HFKEY = gr.Textbox(visible= False, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
|
416 |
|
417 |
gr.Examples(
|
418 |
examples=[
|
419 |
[
|
420 |
+
"./assets/Video_main.mp4",
|
421 |
+
"",
|
422 |
+
True,
|
423 |
"base",
|
424 |
16,
|
425 |
"float32",
|
426 |
+
"Spanish (es)",
|
427 |
+
"English (en)",
|
428 |
1,
|
429 |
2,
|
430 |
'en-AU-WilliamNeural-Male',
|
|
|
433 |
'en-GB-SoniaNeural-Female',
|
434 |
'en-NZ-MitchellNeural-Male',
|
435 |
'en-GB-MaisieNeural-Female',
|
436 |
+
"video_output.mp4",
|
437 |
+
'Adjusting volumes and mixing audio',
|
438 |
],
|
439 |
],
|
440 |
fn=translate_from_video,
|
441 |
inputs=[
|
442 |
video_input,
|
443 |
+
HFKEY,
|
444 |
+
PREVIEW,
|
445 |
+
WHISPER_MODEL_SIZE,
|
446 |
batch_size,
|
447 |
+
compute_type,
|
448 |
+
SOURCE_LANGUAGE,
|
449 |
+
TRANSLATE_AUDIO_TO,
|
450 |
min_speakers,
|
451 |
max_speakers,
|
452 |
tts_voice00,
|
|
|
455 |
tts_voice03,
|
456 |
tts_voice04,
|
457 |
tts_voice05,
|
458 |
+
VIDEO_OUTPUT_NAME,
|
459 |
+
AUDIO_MIX,
|
460 |
],
|
461 |
outputs=[video_output],
|
462 |
+
cache_examples=False,
|
463 |
)
|
464 |
|
465 |
+
### link
|
466 |
|
467 |
with gr.Tab("Translate audio from video link"):
|
468 |
with gr.Row():
|
469 |
with gr.Column():
|
470 |
+
|
471 |
+
blink_input = gr.Textbox(label="Media link.", info="Example: www.youtube.com/watch?v=g_9rPvbENUw", placeholder="URL goes here...")
|
472 |
+
# bSOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], value='en',label = 'Source language')
|
473 |
+
|
474 |
+
# gr.HTML("<hr></h2>")
|
475 |
+
|
476 |
+
# bHFKEY = gr.Textbox(label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
|
477 |
+
|
478 |
+
# gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
|
479 |
+
# bTRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en',label = 'Translate audio to')
|
480 |
+
|
481 |
+
# gr.Markdown("Select how many people are speaking in the video.")
|
482 |
+
# bmin_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1, )
|
483 |
+
# bmax_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers",step=1)
|
484 |
+
|
485 |
+
# gr.Markdown("Select the voice you want for each speaker.")
|
486 |
+
# btts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1')
|
487 |
+
# btts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2')
|
488 |
+
# btts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3')
|
489 |
+
# btts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4')
|
490 |
+
# btts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5')
|
491 |
+
# btts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6')
|
492 |
+
|
493 |
+
# with gr.Column():
|
494 |
+
# with gr.Accordion("Advanced Settings", open=False):
|
495 |
+
# gr.Markdown("Default configuration of Whisper.")
|
496 |
+
# bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
|
497 |
+
# bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
|
498 |
+
# bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
|
499 |
+
|
500 |
+
# bPREVIEW = gr.inputs.Checkbox(label="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
|
501 |
+
# bVIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name" ,value="video_output.mp4")
|
502 |
+
|
503 |
+
bSOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'English (en)', 'French (fr)', 'German (de)', 'Spanish (es)', 'Italian (it)', 'Japanese (ja)', 'Chinese (zh)', 'Dutch (nl)', 'Ukrainian (uk)', 'Portuguese (pt)'], value='Automatic detection',label = 'Source language', info="This is the original language of the video")
|
504 |
+
bTRANSLATE_AUDIO_TO = gr.Dropdown(['English (en)', 'French (fr)', 'German (de)', 'Spanish (es)', 'Italian (it)', 'Japanese (ja)', 'Chinese (zh)', 'Dutch (nl)', 'Ukrainian (uk)', 'Portuguese (pt)'], value='English (en)',label = 'Translate audio to', info="Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
|
505 |
+
|
506 |
+
bline_ = gr.HTML("<hr></h2>")
|
507 |
gr.Markdown("Select how many people are speaking in the video.")
|
508 |
+
bmin_speakers = gr.Slider(1, MAX_TTS, default=1, label="min_speakers", step=1, visible=False)
|
509 |
+
bmax_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interative=True)
|
|
|
510 |
gr.Markdown("Select the voice you want for each speaker.")
|
511 |
+
def bsubmit(value):
    """Return visibility updates for the six link-tab TTS speaker dropdowns.

    The dropdown at 0-based position ``slot`` is made visible when
    ``slot < value``; the remaining ones are hidden.
    """
    # One update per dropdown, in the same order as the outputs list
    # passed to bmax_speakers.change.
    return [gr.update(visible=(slot < value)) for slot in range(6)]
|
516 |
+
btts_voice00 = gr.Dropdown(list_tts, value='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1', visible=True, interactive= True)
|
517 |
+
btts_voice01 = gr.Dropdown(list_tts, value='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2', visible=True, interactive= True)
|
518 |
+
btts_voice02 = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3', visible=False, interactive= True)
|
519 |
+
btts_voice03 = gr.Dropdown(list_tts, value='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4', visible=False, interactive= True)
|
520 |
+
btts_voice04 = gr.Dropdown(list_tts, value='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5', visible=False, interactive= True)
|
521 |
+
btts_voice05 = gr.Dropdown(list_tts, value='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6', visible=False, interactive= True)
|
522 |
+
bmax_speakers.change(bsubmit, bmax_speakers, [btts_voice00, btts_voice01, btts_voice02, btts_voice03, btts_voice04, btts_voice05])
|
523 |
+
|
524 |
+
|
525 |
+
with gr.Column():
|
526 |
+
with gr.Accordion("Advanced Settings", open=False):
|
527 |
+
|
528 |
+
bAUDIO_MIX = gr.Dropdown(['Mixing audio with sidechain compression', 'Adjusting volumes and mixing audio'], value='Adjusting volumes and mixing audio', label = 'Audio Mixing Method', info="Mix original and translated audio files to create a customized, balanced output with two available mixing modes.")
|
529 |
+
|
530 |
+
gr.HTML("<hr></h2>")
|
531 |
+
gr.Markdown("Default configuration of Whisper.")
|
532 |
+
bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
|
533 |
+
bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
|
534 |
+
bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
|
535 |
+
|
536 |
+
gr.HTML("<hr></h2>")
|
537 |
+
bVIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name" ,value="video_output.mp4", info="The name of the output file")
|
538 |
+
bPREVIEW = gr.Checkbox(label="Preview", info="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
|
539 |
+
|
540 |
+
|
541 |
+
|
542 |
# text_button = gr.Button("Translate audio of video")
|
543 |
# link_output = gr.Video() #gr.outputs.File(label="Download!")
|
544 |
|
545 |
|
546 |
|
547 |
+
with gr.Column(variant='compact'):
|
548 |
with gr.Row():
|
549 |
text_button = gr.Button("TRANSLATE")
|
550 |
with gr.Row():
|
551 |
+
blink_output = gr.Video() #gr.outputs.File(label="Download!") # gr.Video()
|
552 |
+
|
553 |
+
|
554 |
+
bline_ = gr.HTML("<hr></h2>")
|
555 |
+
if os.getenv("YOUR_HF_TOKEN") == None or os.getenv("YOUR_HF_TOKEN") == "":
|
556 |
+
bHFKEY = gr.Textbox(visible= True, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
|
557 |
+
else:
|
558 |
+
bHFKEY = gr.Textbox(visible= False, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
|
559 |
|
560 |
gr.Examples(
|
561 |
examples=[
|
562 |
[
|
563 |
"https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
|
564 |
+
"",
|
565 |
+
True,
|
566 |
"base",
|
567 |
16,
|
568 |
"float32",
|
569 |
+
"Japanese (ja)",
|
570 |
+
"English (en)",
|
571 |
1,
|
572 |
2,
|
573 |
'en-CA-ClaraNeural-Female',
|
|
|
576 |
'en-GB-SoniaNeural-Female',
|
577 |
'en-NZ-MitchellNeural-Male',
|
578 |
'en-GB-MaisieNeural-Female',
|
579 |
+
"video_output.mp4",
|
580 |
+
'Adjusting volumes and mixing audio',
|
581 |
],
|
582 |
],
|
583 |
fn=translate_from_video,
|
584 |
inputs=[
|
585 |
+
blink_input,
|
586 |
+
bHFKEY,
|
587 |
+
bPREVIEW,
|
588 |
+
bWHISPER_MODEL_SIZE,
|
589 |
bbatch_size,
|
590 |
+
bcompute_type,
|
591 |
+
bSOURCE_LANGUAGE,
|
592 |
+
bTRANSLATE_AUDIO_TO,
|
593 |
bmin_speakers,
|
594 |
bmax_speakers,
|
595 |
btts_voice00,
|
|
|
598 |
btts_voice03,
|
599 |
btts_voice04,
|
600 |
btts_voice05,
|
601 |
+
bVIDEO_OUTPUT_NAME,
|
602 |
+
bAUDIO_MIX
|
603 |
],
|
604 |
+
outputs=[blink_output],
|
605 |
+
cache_examples=False,
|
606 |
)
|
607 |
|
608 |
|
609 |
+
|
610 |
+
|
611 |
+
with gr.Tab("Help"):
|
612 |
+
gr.Markdown(news)
|
613 |
+
gr.Markdown(tutorial)
|
614 |
+
|
615 |
+
with gr.Accordion("Logs", open = False):
|
616 |
logs = gr.Textbox()
|
617 |
demo.load(read_logs, None, logs, every=1)
|
618 |
|
619 |
# run
|
620 |
video_button.click(translate_from_video, inputs=[
|
621 |
+
video_input,
|
622 |
+
HFKEY,
|
623 |
+
PREVIEW,
|
624 |
+
WHISPER_MODEL_SIZE,
|
625 |
batch_size,
|
626 |
+
compute_type,
|
627 |
+
SOURCE_LANGUAGE,
|
628 |
+
TRANSLATE_AUDIO_TO,
|
629 |
min_speakers,
|
630 |
max_speakers,
|
631 |
tts_voice00,
|
|
|
633 |
tts_voice02,
|
634 |
tts_voice03,
|
635 |
tts_voice04,
|
636 |
+
tts_voice05,
|
637 |
+
VIDEO_OUTPUT_NAME,
|
638 |
+
AUDIO_MIX,
|
639 |
+
], outputs=video_output)
|
640 |
text_button.click(translate_from_video, inputs=[
|
641 |
+
blink_input,
|
642 |
+
bHFKEY,
|
643 |
+
bPREVIEW,
|
644 |
+
bWHISPER_MODEL_SIZE,
|
645 |
bbatch_size,
|
646 |
+
bcompute_type,
|
647 |
+
bSOURCE_LANGUAGE,
|
648 |
+
bTRANSLATE_AUDIO_TO,
|
649 |
bmin_speakers,
|
650 |
bmax_speakers,
|
651 |
btts_voice00,
|
|
|
653 |
btts_voice02,
|
654 |
btts_voice03,
|
655 |
btts_voice04,
|
656 |
+
btts_voice05,
|
657 |
+
bVIDEO_OUTPUT_NAME,
|
658 |
+
bAUDIO_MIX,
|
659 |
+
], outputs=blink_output)
|
660 |
|
661 |
demo.launch(enable_queue=True)
|
662 |
+
#demo.launch()
|
|
|
|
|
|