r3gm committed on
Commit 431084f
• 1 Parent(s): 5146f4b

add new ui and options for demo

Files changed (1)
  1. app.py +347 -158
app.py CHANGED
@@ -1,6 +1,4 @@
-
- #os.system("git clone https://github.com/R3gm/SoniTranslate")
- # pip install -r requirements.txt
  import numpy as np
  import gradio as gr
  import whisperx
@@ -17,45 +15,41 @@ import os
  from soni_translate.audio_segments import create_translated_audio
  from soni_translate.text_to_speech import make_voice_gradio
  from soni_translate.translate_segments import translate_text
- #from soni_translate import test
 
  title = "<center><strong><font size='7'>📽️ SoniTranslate 🈷️</font></strong></center>"
 
  news = """ ## 📖 News
- 🔥 2023/07/01: Support (Thanks for [text](https://github.com)).
- """
 
- description = """ ## Translate the audio of a video content from one language to another while preserving synchronization.
-
- This is a demo on Github project 📽️ [SoniTranslate](https://github.com/R3gm/SoniTranslate).
-
- 📼 You can upload a video or provide a video link. The generation is **limited to 10 seconds** to prevent errors with the queue in cpu. If you use a GPU, you won't have any of these limitations.
-
- 🚀 For **translate a video of any duration** and faster results, you can use the Colab notebook with GPU.
-
- [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)
-
- """
 
- tutorial = """ # 🔰 Instructions for use.
-
- 1. Upload a video on the first tab or use a video link on the second tab.
-
- 2. Choose the language in which you want to translate the video.
-
- 3. Specify the number of people speaking in the video and assign each one a text-to-speech voice suitable for the translation language.
-
- 4. Press the 'Translate' button to obtain the results.
-
- """
 
- if not os.path.exists('audio'):
-     os.makedirs('audio')
 
- if not os.path.exists('audio2/audio'):
-     os.makedirs('audio2/audio')
 
  # Check GPU
  if torch.cuda.is_available():
@@ -70,74 +64,151 @@ else:
  whisper_model_default = 'base'
  print('Working in: ', device)
 
- # Download an audio
- #url = "https://www.youtube.com/watch?v=Rdi-SNhe2v4"
-
- ### INIT
- list_tts = ['af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male', 'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female', 'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male', 'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female', 'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male', 'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male', 'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female', 'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male', 'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female', 'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male', 'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male', 'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female', 'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female', 'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male', 'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female', 'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male', 'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female', 'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male', 'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female', 'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female', 'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male', 'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female', 'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female', 'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female', 'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female', 'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female', 'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male', 'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male', 'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female', 'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male', 'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male', 'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male', 'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 'en-GB-ThomasNeural-Male', 'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female', 'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female', 'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male', 'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male', 'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female', 'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female', 'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female', 'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male', 'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female', 'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male', 'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male', 'en-US-SteffanNeural-Male', 'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male', 'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male', 'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female', 'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male', 'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female', 'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female', 'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male', 'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female', 'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male', 'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female', 'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female', 'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female', 'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female', 'es-MX-DaliaNeural-Female', 
'es-MX-JorgeNeural-Male', 'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female', 'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male', 'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female', 'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male', 'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female', 'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male', 'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female', 'es-UY-MateoNeural-Male', 'es-UY-ValentinaNeural-Female', 'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male', 'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male', 'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male', 'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female', 'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female', 'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male', 'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female', 'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male', 'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male', 'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female', 'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female', 'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male', 'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female', 'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female', 'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male', 'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male', 'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female', 'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male', 'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female', 'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female', 'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female', 'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male', 'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male', 'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female', 'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female', 'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female', 'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female', 'lt-LT-LeonasNeural-Male', 'lt-LT-OnaNeural-Female', 'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male', 'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female', 'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female', 'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female', 'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male', 'ms-MY-OsmanNeural-Male', 'ms-MY-YasminNeural-Female', 'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male', 'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male', 'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female', 'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male', 'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female', 'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male', 'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female', 'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female', 'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female', 'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female', 'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male', 'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female', 'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female', 'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female', 'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male', 'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female', 'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male', 'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female', 'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female', 
'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female', 'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female', 'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female', 'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male', 'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female', 'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male', 'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female', 'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female', 'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female', 'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female', 'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female', 'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male', 'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female', 'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male', 'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male', 'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male', 'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male', 'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female']
-
-
- def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
      TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
      tts_voice00, tts_voice01,tts_voice02,tts_voice03,tts_voice04,tts_voice05):
 
      YOUR_HF_TOKEN = os.getenv("My_hf_token")
 
-
      OutputFile = 'Video.mp4'
      audio_wav = "audio.wav"
-     Output_name_file = "audio_dub_solo.wav"
      mix_audio = "audio_mix.mp3"
-     video_output = "diar_output.mp4"
-
-     os.system(f"rm {Output_name_file}")
      os.system("rm Video.mp4")
-     #os.system("rm diar_output.mp4")
      os.system("rm audio.wav")
-
 
      if os.path.exists(video):
-         print(f"### Start Video ###")
-         if device == 'cpu':
-             # max 1 minute in cpu
-             print('10 s. Limited for CPU ')
              os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
          else:
              os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')
-
          os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
      else:
-         print(f"### Start {video} ###")
-         if device == 'cpu':
-             # max 1 minute in cpu
-             print('10 s. Limited for CPU ')
              #https://github.com/yt-dlp/yt-dlp/issues/2220
              mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
-             wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 1 audio.wav"
          else:
              mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
              wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
-
-         os.system(mp4_)
-         os.system(wav_)
 
      print("Set file complete.")
-
      # 1. Transcribe with original whisper (batched)
      model = whisperx.load_model(
          WHISPER_MODEL_SIZE,
          device,
-         compute_type=compute_type
      )
      audio = whisperx.load_audio(audio_wav)
      result = model.transcribe(audio, batch_size=batch_size)
      gc.collect(); torch.cuda.empty_cache(); del model
      print("Transcript complete")
-
      # 2. Align whisper output
      model_a, metadata = whisperx.load_align_model(
-         language_code=result["language"],
          device=device
      )
      result = whisperx.align(
@@ -150,7 +221,11 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
      )
      gc.collect(); torch.cuda.empty_cache(); del model_a
      print("Align complete")
-
      # 3. Assign speaker labels
      diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
      diarize_segments = diarize_model(
@@ -160,10 +235,10 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
      result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
      gc.collect(); torch.cuda.empty_cache(); del diarize_model
      print("Diarize complete")
-
      result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
      print("Translation complete")
-
      audio_files = []
 
      # Mapping speakers to voice variables
@@ -176,7 +251,7 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
          'SPEAKER_05': tts_voice05
      }
 
-     for segment in result_diarize['segments']:
 
          text = segment['text']
          start = segment['start']
@@ -193,7 +268,7 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
          filename = f"audio/{start}.ogg"
 
          if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
-             make_voice_gradio(text, speaker_to_voice[speaker], filename)
          elif speaker == "SPEAKER_99":
              try:
                  tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
@@ -202,7 +277,7 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
              except:
                  tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
                  tts.save(filename)
-                 print('ERROR AUDIO GTTS')
 
          # duration
          duration_true = end - start
@@ -212,7 +287,7 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
          porcentaje = duration_tts / duration_true
 
          if porcentaje > 2.1:
-             porcentaje = 2.1
         elif porcentaje <= 1.2 and porcentaje >= 0.8:
              porcentaje = 1.0
          elif porcentaje <= 0.79:
@@ -231,25 +306,26 @@ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
      os.system("mv -f audio2/audio/*.ogg audio/")
 
      os.system(f"rm {Output_name_file}")
-
      create_translated_audio(result_diarize, audio_files, Output_name_file)
 
-     os.system("rm audio_dub_stereo.wav")
-     os.system("ffmpeg -i audio_dub_solo.wav -ac 1 audio_dub_stereo.wav")
-
-     #os.system(f"ffmpeg -i Video.mp4 -i {Output_name_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
-
      os.system(f"rm {mix_audio}")
-     #os.system(f'''ffmpeg -i {audio_wav} -i audio_dub_stereo.wav -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}''')
-     #os.system(f'ffmpeg -y -i {audio_wav} -i audio_dub_stereo.wav -filter_complex "[0:0][1:0] amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
-     os.system(f'ffmpeg -y -i audio.wav -i audio_dub_stereo.wav -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
-
      os.system(f"rm {video_output}")
      os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
-
-     return video_output
-
 
  import sys
 
@@ -276,52 +352,79 @@ def read_logs():
      with open("output.log", "r") as f:
          return f.read()
 
- with gr.Blocks() as demo:
      gr.Markdown(title)
      gr.Markdown(description)
-     gr.Markdown(tutorial)
 
      with gr.Tab("Translate audio from video"):
          with gr.Row():
              with gr.Column():
                  video_input = gr.Video() # height=300,width=300
-
-                 gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
-                 TRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en', label='Translate audio to')
-
                  gr.Markdown("Select how many people are speaking in the video.")
-                 min_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
-                 max_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers", step=1)
-
                  gr.Markdown("Select the voice you want for each speaker.")
-                 tts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label='TTS Speaker 1')
-                 tts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label='TTS Speaker 2')
-                 tts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label='TTS Speaker 3')
-                 tts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label='TTS Speaker 4')
-                 tts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label='TTS Speaker 5')
-                 tts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label='TTS Speaker 6')
-
-                 gr.Markdown("Default configuration of Whisper.")
-                 WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
-                 batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
-                 compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
-
-             with gr.Column(variant='compact'):
                  with gr.Row():
                      video_button = gr.Button("TRANSLATE", )
                  with gr.Row():
                      video_output = gr.Video()
 
          gr.Examples(
              examples=[
                  [
-                     "./assets/Video_subtitled.mp4",
                      "base",
                      16,
                      "float32",
-                     "en",
                      1,
                      2,
                      'en-AU-WilliamNeural-Male',
@@ -330,15 +433,20 @@ with gr.Blocks() as demo:
                      'en-GB-SoniaNeural-Female',
                      'en-NZ-MitchellNeural-Male',
                      'en-GB-MaisieNeural-Female',
                  ],
              ],
              fn=translate_from_video,
              inputs=[
                  video_input,
-                 WHISPER_MODEL_SIZE,
                  batch_size,
-                 compute_type,
-                 TRANSLATE_AUDIO_TO,
                  min_speakers,
                  max_speakers,
                  tts_voice00,
@@ -347,58 +455,119 @@ with gr.Blocks() as demo:
                  tts_voice03,
                  tts_voice04,
                  tts_voice05,
              ],
              outputs=[video_output],
-             cache_examples=True,
          )
 
      with gr.Tab("Translate audio from video link"):
          with gr.Row():
              with gr.Column():
-
-                 link_input = gr.Textbox(label="Media link. Example: www.youtube.com/watch?v=g_9rPvbENUw", placeholder="URL goes here...")
-                 #filename = gr.Textbox(label="File name", placeholder="best-vid")
-
-                 gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
-                 bTRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en', label='Translate audio to')
-
                  gr.Markdown("Select how many people are speaking in the video.")
-                 bmin_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
-                 bmax_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers", step=1)
-
                  gr.Markdown("Select the voice you want for each speaker.")
-                 btts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label='TTS Speaker 1')
-                 btts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label='TTS Speaker 2')
-                 btts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label='TTS Speaker 3')
-                 btts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label='TTS Speaker 4')
-                 btts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label='TTS Speaker 5')
-                 btts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label='TTS Speaker 6')
-
-                 gr.Markdown("Default configuration of Whisper.")
-                 bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
-                 bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
-                 bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
-
                  # text_button = gr.Button("Translate audio of video")
                  # link_output = gr.Video() #gr.outputs.File(label="Download!")
 
-             with gr.Column(variant='compact'):
                  with gr.Row():
                      text_button = gr.Button("TRANSLATE")
                  with gr.Row():
-                     link_output = gr.Video() #gr.outputs.File(label="Download!") # gr.Video()
 
          gr.Examples(
              examples=[
                  [
                      "https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
                      "base",
                      16,
                      "float32",
-                     "en",
                      1,
                      2,
                      'en-CA-ClaraNeural-Female',
@@ -407,15 +576,20 @@ with gr.Blocks() as demo:
                      'en-GB-SoniaNeural-Female',
                      'en-NZ-MitchellNeural-Male',
                      'en-GB-MaisieNeural-Female',
                  ],
              ],
              fn=translate_from_video,
              inputs=[
-                 link_input,
-                 bWHISPER_MODEL_SIZE,
                  bbatch_size,
-                 bcompute_type,
-                 bTRANSLATE_AUDIO_TO,
                  bmin_speakers,
                  bmax_speakers,
                  btts_voice00,
@@ -424,24 +598,34 @@ with gr.Blocks() as demo:
                  btts_voice03,
                  btts_voice04,
                  btts_voice05,
              ],
-             outputs=[link_output],
-             cache_examples=True,
          )
 
-
-     with gr.Accordion("Logs"):
          logs = gr.Textbox()
          demo.load(read_logs, None, logs, every=1)
 
      # run
      video_button.click(translate_from_video, inputs=[
-         video_input,
-         WHISPER_MODEL_SIZE,
          batch_size,
-         compute_type,
-         TRANSLATE_AUDIO_TO,
          min_speakers,
          max_speakers,
          tts_voice00,
@@ -449,13 +633,19 @@ with gr.Blocks() as demo:
          tts_voice02,
          tts_voice03,
          tts_voice04,
-         tts_voice05,], outputs=video_output)
 
      text_button.click(translate_from_video, inputs=[
-         link_input,
-         bWHISPER_MODEL_SIZE,
          bbatch_size,
-         bcompute_type,
-         bTRANSLATE_AUDIO_TO,
          bmin_speakers,
          bmax_speakers,
          btts_voice00,
@@ -463,11 +653,10 @@ with gr.Blocks() as demo:
          btts_voice02,
          btts_voice03,
          btts_voice04,
-         btts_voice05,], outputs=link_output)
-
 
      demo.launch(enable_queue=True)
-
-
-
-
+ #%cd SoniTranslate
 
  import numpy as np
  import gradio as gr
  import whisperx
  ...
  from soni_translate.audio_segments import create_translated_audio
  from soni_translate.text_to_speech import make_voice_gradio
  from soni_translate.translate_segments import translate_text
 
  title = "<center><strong><font size='7'>📽️ SoniTranslate 🈷️</font></strong></center>"
 
  news = """ ## 📖 News
+ 🔥 2023/07/26: New UI and mix options added.
+ """
+
+ description = """
+ ### 🎥 **Translate videos easily with SoniTranslate!** 📽️
+
+ Upload a video or provide a video link. Limitation: 10 seconds on CPU, but no restrictions with a GPU.
+
+ For faster results and no duration limits, try the Colab notebook with a GPU:
+ [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)
+
+ 📽️ **This is a demo of SoniTranslate; GitHub repository: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!**
+
+ See the tab labeled 'Help' for instructions on how to use it. Let's start having fun with video translation! 🚀🎉
+ """
 
+ tutorial = """
+ ## 🔰 **Instructions for use:**
+
+ 1. 📤 **Upload a video** on the first tab or 🌐 **use a video link** on the second tab.
+
+ 2. 🌍 Choose the language in which you want to **translate the video**.
+
+ 3. 🗣️ Specify the **number of people speaking** in the video and **assign each one a text-to-speech voice** suitable for the translation language.
+
+ 4. 🚀 Press the '**Translate**' button to obtain the results.
+ """
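Editor's note: the four tutorial steps map one-to-one onto the reworked callback defined further down in this diff, so the same run can be scripted without the UI. A minimal sketch, assuming app.py is importable without auto-launching the interface and that you hold a valid Hugging Face token (placeholder shown):

```python
# Hypothetical direct call; the keyword names mirror the new
# translate_from_video signature added in this commit.
from app import translate_from_video

dub = translate_from_video(
    "./assets/Video_main.mp4",            # 1. a local file (or a media link)
    YOUR_HF_TOKEN="hf_xxx",               # placeholder; pyannote models are license-gated
    preview=True,                         # trims to 10 s for a quick test
    WHISPER_MODEL_SIZE="base",
    SOURCE_LANGUAGE="Spanish (es)",       # 2. languages use the dropdown labels
    TRANSLATE_AUDIO_TO="English (en)",
    max_speakers=2,                       # 3. speaker count and voices
    tts_voice00="en-AU-WilliamNeural-Male",
    tts_voice01="en-CA-ClaraNeural-Female",
)                                         # 4. returns the dubbed video path
print(dub)                                # "video_dub.mp4" by default
```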
 
  # Check GPU
  if torch.cuda.is_available():
  ...
  whisper_model_default = 'base'
  print('Working in: ', device)
 
+ list_tts = ['af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male', 'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female', 'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male', 'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female', 'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male', 'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male', 'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female', 'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male', 'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female', 'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male', 'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male', 'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female', 'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female', 'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male', 'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female', 'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male', 'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female', 'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male', 'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female', 'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female', 'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male', 'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female', 'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female', 'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female', 'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female', 'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female', 'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male', 'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male', 'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female', 'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male', 'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male', 'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male', 'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 'en-GB-ThomasNeural-Male', 'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female', 'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female', 'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male', 'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male', 'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female', 'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female', 'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female', 'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male', 'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female', 'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male', 'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male', 'en-US-SteffanNeural-Male', 'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male', 'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male', 'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female', 'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male', 'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female', 'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female', 'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male', 'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female', 'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male', 'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female', 'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female', 'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female', 'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female', 'es-MX-DaliaNeural-Female', 
'es-MX-JorgeNeural-Male', 'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female', 'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male', 'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female', 'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male', 'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female', 'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male', 'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female', 'es-UY-MateoNeural-Male', 'es-UY-ValentinaNeural-Female', 'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male', 'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male', 'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male', 'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female', 'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female', 'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male', 'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female', 'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male', 'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male', 'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female', 'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female', 'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male', 'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female', 'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female', 'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male', 'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male', 'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female', 'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male', 'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female', 'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female', 'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female', 'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male', 'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male', 'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female', 'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female', 'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female', 'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female', 'lt-LT-LeonasNeural-Male', 'lt-LT-OnaNeural-Female', 'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male', 'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female', 'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female', 'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female', 'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male', 'ms-MY-OsmanNeural-Male', 'ms-MY-YasminNeural-Female', 'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male', 'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male', 'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female', 'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male', 'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female', 'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male', 'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female', 'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female', 'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female', 'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female', 'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male', 'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female', 'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female', 'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female', 'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male', 'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female', 'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male', 'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female', 'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female', 
'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female', 'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female', 'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female', 'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male', 'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female', 'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male', 'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female', 'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female', 'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female', 'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female', 'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female', 'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male', 'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female', 'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male', 'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male', 'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male', 'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male', 'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female']
 
+ '''
+ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
      TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
      tts_voice00, tts_voice01,tts_voice02,tts_voice03,tts_voice04,tts_voice05):
 
      YOUR_HF_TOKEN = os.getenv("My_hf_token")
 
+     create_translated_audio(result_diarize, audio_files, Output_name_file)
+
+     os.system("rm audio_dub_stereo.wav")
+     os.system("ffmpeg -i audio_dub_solo.wav -ac 1 audio_dub_stereo.wav")
+
+     os.system(f"rm {mix_audio}")
+     os.system(f'ffmpeg -y -i audio.wav -i audio_dub_stereo.wav -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
+
+     os.system(f"rm {video_output}")
+     os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
+
+     return video_output
+ '''
+
+ def translate_from_video(
+     video,
+     YOUR_HF_TOKEN,
+     preview=False,
+     WHISPER_MODEL_SIZE="large-v1",
+     batch_size=16,
+     compute_type="float16",
+     SOURCE_LANGUAGE="Automatic detection",
+     TRANSLATE_AUDIO_TO="English (en)",
+     min_speakers=1,
+     max_speakers=2,
+     tts_voice00="en-AU-WilliamNeural-Male",
+     tts_voice01="en-CA-ClaraNeural-Female",
+     tts_voice02="en-GB-ThomasNeural-Male",
+     tts_voice03="en-GB-SoniaNeural-Female",
+     tts_voice04="en-NZ-MitchellNeural-Male",
+     tts_voice05="en-GB-MaisieNeural-Female",
+     video_output="video_dub.mp4",
+     AUDIO_MIX_METHOD='Adjusting volumes and mixing audio',
+ ):
+
+     if YOUR_HF_TOKEN == "" or YOUR_HF_TOKEN is None:
+         YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
+         if YOUR_HF_TOKEN is None:
+             print('No valid token')
+             return
+
+     if "SET_LIMIT" == os.getenv("DEMO"):
+         preview = True
+         print("DEMO; set preview=True; The generation is limited to 10 seconds to prevent errors with the CPU. If you use a GPU, you won't have any of these limitations.")
+         AUDIO_MIX_METHOD = 'Adjusting volumes and mixing audio'
+         print("DEMO; set Adjusting volumes and mixing audio")
+
+     LANGUAGES = {
+         'Automatic detection': 'Automatic detection',
+         'English (en)': 'en',
+         'French (fr)': 'fr',
+         'German (de)': 'de',
+         'Spanish (es)': 'es',
+         'Italian (it)': 'it',
+         'Japanese (ja)': 'ja',
+         'Chinese (zh)': 'zh',
+         'Dutch (nl)': 'nl',
+         'Ukrainian (uk)': 'uk',
+         'Portuguese (pt)': 'pt'
+     }
+
+     TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO]
+     SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE]
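Editor's note: the LANGUAGES table above maps the human-readable dropdown labels to ISO-639-1 codes. Since every label already embeds its code in parentheses, the same lookup could be derived at runtime instead of maintained by hand; a small illustrative alternative, not part of the commit:

```python
import re

def label_to_code(label: str) -> str:
    # "English (en)" -> "en"; labels without a code pass through unchanged.
    match = re.search(r"\((\w+)\)", label)
    return match.group(1) if match else label

assert label_to_code("English (en)") == "en"
assert label_to_code("Automatic detection") == "Automatic detection"
```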
+
+     if not os.path.exists('audio'):
+         os.makedirs('audio')
+
+     if not os.path.exists('audio2/audio'):
+         os.makedirs('audio2/audio')
+
+     # Check GPU
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     compute_type = "float32" if device == "cpu" else compute_type
+
      OutputFile = 'Video.mp4'
      audio_wav = "audio.wav"
+     Output_name_file = "audio_dub_solo.ogg"
      mix_audio = "audio_mix.mp3"
+
      os.system("rm Video.mp4")
+     os.system("rm audio.webm")
      os.system("rm audio.wav")
 
      if os.path.exists(video):
+         if preview:
+             print('Creating a 10-second preview video. To disable this option, go to advanced settings and turn off preview.')
              os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
          else:
              os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')
+
          os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
      else:
+         if preview:
+             print('Creating a 10-second preview from the link. To disable this option, go to advanced settings and turn off preview.')
              #https://github.com/yt-dlp/yt-dlp/issues/2220
              mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
+             wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
+             os.system(mp4_)
+             os.system(wav_)
          else:
              mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
              wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
+
+             os.system(wav_)
+
+             for i in range(120):
+                 time.sleep(1)
+                 print('process audio...')
+                 if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
+                     time.sleep(1)
+                     os.system(mp4_)
+                     break
+                 if i == 119:
+                     print('Error downloading the audio')
+                     return
 
      print("Set file complete.")
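Editor's note: in the non-preview link branch above, audio extraction and the video download are two separate yt-dlp invocations, so the code polls the filesystem for up to 120 seconds until audio.wav appears before fetching the video. The same wait, factored into a reusable helper; a sketch with hypothetical names, not the commit's code:

```python
import time
from pathlib import Path

def wait_for_file(path: str, timeout_s: int = 120, poll_s: float = 1.0) -> bool:
    """Poll until `path` exists or the timeout expires; True on success."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if Path(path).exists():
            return True
        time.sleep(poll_s)
    return False

# e.g. wait for the extracted track before starting the video download:
# if not wait_for_file("audio.wav"):
#     raise TimeoutError("yt-dlp did not produce audio.wav within 120 s")
```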
+
+     SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE
+
      # 1. Transcribe with original whisper (batched)
      model = whisperx.load_model(
          WHISPER_MODEL_SIZE,
          device,
+         compute_type=compute_type,
+         language=SOURCE_LANGUAGE,
      )
      audio = whisperx.load_audio(audio_wav)
      result = model.transcribe(audio, batch_size=batch_size)
      gc.collect(); torch.cuda.empty_cache(); del model
      print("Transcript complete")
+
      # 2. Align whisper output
      model_a, metadata = whisperx.load_align_model(
+         language_code=result["language"],
          device=device
      )
      result = whisperx.align(
  ...
      )
      gc.collect(); torch.cuda.empty_cache(); del model_a
      print("Align complete")
+
+     if result['segments'] == []:
+         print('No active speech found in audio')
+         return
+
      # 3. Assign speaker labels
      diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
      diarize_segments = diarize_model(
  ...
      result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
      gc.collect(); torch.cuda.empty_cache(); del diarize_model
      print("Diarize complete")
+
      result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
      print("Translation complete")
+
      audio_files = []
 
      # Mapping speakers to voice variables
  ...
          'SPEAKER_05': tts_voice05
      }
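Editor's note: the speaker_to_voice dict above pins each diarization label to one of the six voice dropdowns. Because pyannote labels follow the SPEAKER_NN pattern, the mapping could also be generated from the voice arguments, which would keep it in lockstep with MAX_TTS; an illustrative alternative only:

```python
def build_speaker_map(voices: list[str]) -> dict[str, str]:
    # SPEAKER_00..SPEAKER_NN -> selected TTS voice, mirroring the hand-written dict.
    return {f"SPEAKER_{i:02d}": v for i, v in enumerate(voices)}

print(build_speaker_map(["en-AU-WilliamNeural-Male", "en-CA-ClaraNeural-Female"]))
# {'SPEAKER_00': 'en-AU-WilliamNeural-Male', 'SPEAKER_01': 'en-CA-ClaraNeural-Female'}
```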
 
+     for segment in tqdm(result_diarize['segments']):
 
          text = segment['text']
          start = segment['start']
  ...
          filename = f"audio/{start}.ogg"
 
          if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
+             make_voice_gradio(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
          elif speaker == "SPEAKER_99":
              try:
                  tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
  ...
              except:
                  tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
                  tts.save(filename)
+                 print('Error: Audio will be replaced.')
 
          # duration
          duration_true = end - start
  ...
          porcentaje = duration_tts / duration_true
 
          if porcentaje > 2.1:
+             porcentaje = 2.1
          elif porcentaje <= 1.2 and porcentaje >= 0.8:
              porcentaje = 1.0
          elif porcentaje <= 0.79:
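Editor's note: porcentaje is the ratio between the synthesized clip's duration and the slot it must fill, clamped to at most 2.1x; the lines that actually apply the speed-up sit in the unchanged region this diff skips (new lines 294-305, after which the results are moved from audio2/audio back into audio/). A common way to realize such a ratio is ffmpeg's atempo filter; the sketch below is an assumption about that elided step, not the project's verbatim code:

```python
import os

def accelerate_audio(in_path: str, out_path: str, ratio: float) -> None:
    # Hypothetical helper: atempo accepts factors in [0.5, 2.0] per instance
    # on older ffmpeg, so chain filters for larger ratios
    # (e.g. 2.1 -> atempo=2.0,atempo=1.05).
    filters, r = [], max(ratio, 0.5)
    while r > 2.0:
        filters.append("atempo=2.0")
        r /= 2.0
    filters.append(f"atempo={r:.3f}")
    chain = ",".join(filters)
    os.system(f'ffmpeg -y -i {in_path} -filter:a "{chain}" {out_path}')

# e.g. write the sped-up clip where the later `mv -f audio2/audio/*.ogg audio/` expects it:
# accelerate_audio("audio/12.5.ogg", "audio2/audio/12.5.ogg", 2.1)
```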
 
      os.system("mv -f audio2/audio/*.ogg audio/")
 
      os.system(f"rm {Output_name_file}")
 
      create_translated_audio(result_diarize, audio_files, Output_name_file)
 
      os.system(f"rm {mix_audio}")
+
+     # TYPE MIX AUDIO
+     if AUDIO_MIX_METHOD == 'Adjusting volumes and mixing audio':
+         # volume mix
+         os.system(f'ffmpeg -y -i {audio_wav} -i {Output_name_file} -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
+     else:
+         try:
+             # background mix
+             os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}')
+         except:
+             # volume mix (fallback)
+             os.system(f'ffmpeg -y -i {audio_wav} -i {Output_name_file} -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
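Editor's note on the two mixing modes: the default scales the original bed to 15% and the dub to 190% before summing them with amix, while the alternative ducks the original under the dub with sidechaincompress. One review observation: os.system only returns an exit status, so the try/except above can never catch an ffmpeg failure; checking the return code (or using subprocess) is the reliable route. The same filtergraphs as standalone, checkable calls; a sketch, not the commit's code:

```python
import subprocess

def mix_volume(original: str, dub: str, out: str) -> None:
    # Default mode: quiet original bed (15%) + boosted dub (190%), summed by amix.
    subprocess.run([
        "ffmpeg", "-y", "-i", original, "-i", dub,
        "-filter_complex",
        "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest",
        "-c:a", "libmp3lame", out,
    ], check=True)  # raises CalledProcessError on a nonzero ffmpeg exit

def mix_sidechain(original: str, dub: str, out: str) -> None:
    # Alternative mode: compress the original whenever the dub speaks (ducking).
    subprocess.run([
        "ffmpeg", "-y", "-i", original, "-i", dub,
        "-filter_complex",
        "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg];[bg][mix]amerge[final]",
        "-map", "[final]", out,
    ], check=True)
```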
+
      os.system(f"rm {video_output}")
      os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
 
+     return video_output
 
  import sys
  ...
      with open("output.log", "r") as f:
          return f.read()
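Editor's note: read_logs feeds the Logs box by re-reading output.log once per second (see demo.load(..., every=1) below). The Logger class that tees stdout into that file lives in the unchanged region the diff skips after import sys; the usual shape of that pattern is reconstructed here for context, as an assumption rather than the exact elided code:

```python
import sys

class Logger:
    # Tee stdout to a file so a Gradio Textbox can poll it (assumed pattern).
    def __init__(self, filename="output.log"):
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
        self.log.flush()

    def flush(self):
        self.terminal.flush()

sys.stdout = Logger()
```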
 
+ # max tts
+ MAX_TTS = 6
 
+ theme = 'Taithrah/Minimal'
+
+ with gr.Blocks(theme=theme) as demo:
      gr.Markdown(title)
      gr.Markdown(description)
 
+     #### video
      with gr.Tab("Translate audio from video"):
          with gr.Row():
              with gr.Column():
                  video_input = gr.Video() # height=300,width=300
+                 SOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'English (en)', 'French (fr)', 'German (de)', 'Spanish (es)', 'Italian (it)', 'Japanese (ja)', 'Chinese (zh)', 'Dutch (nl)', 'Ukrainian (uk)', 'Portuguese (pt)'], value='Automatic detection', label='Source language', info="This is the original language of the video")
+                 TRANSLATE_AUDIO_TO = gr.Dropdown(['English (en)', 'French (fr)', 'German (de)', 'Spanish (es)', 'Italian (it)', 'Japanese (ja)', 'Chinese (zh)', 'Dutch (nl)', 'Ukrainian (uk)', 'Portuguese (pt)'], value='English (en)', label='Translate audio to', info="Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
+
+                 line_ = gr.HTML("<hr>")
                  gr.Markdown("Select how many people are speaking in the video.")
+                 min_speakers = gr.Slider(1, MAX_TTS, value=1, label="min_speakers", step=1, visible=False)
+                 max_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interactive=True)
                  gr.Markdown("Select the voice you want for each speaker.")
+                 def submit(value):
+                     visibility_dict = {
+                         f'tts_voice{i:02d}': gr.update(visible=i < value) for i in range(6)
+                     }
+                     return [value for value in visibility_dict.values()]
+                 tts_voice00 = gr.Dropdown(list_tts, value='en-AU-WilliamNeural-Male', label='TTS Speaker 1', visible=True, interactive=True)
+                 tts_voice01 = gr.Dropdown(list_tts, value='en-CA-ClaraNeural-Female', label='TTS Speaker 2', visible=True, interactive=True)
+                 tts_voice02 = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label='TTS Speaker 3', visible=False, interactive=True)
+                 tts_voice03 = gr.Dropdown(list_tts, value='en-GB-SoniaNeural-Female', label='TTS Speaker 4', visible=False, interactive=True)
+                 tts_voice04 = gr.Dropdown(list_tts, value='en-NZ-MitchellNeural-Male', label='TTS Speaker 5', visible=False, interactive=True)
+                 tts_voice05 = gr.Dropdown(list_tts, value='en-GB-MaisieNeural-Female', label='TTS Speaker 6', visible=False, interactive=True)
+                 max_speakers.change(submit, max_speakers, [tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, tts_voice05])
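Editor's note: the submit callback wired through max_speakers.change(...) is what makes exactly the selected number of voice dropdowns visible; the slider value arrives as the input, and the callback returns one gr.update(visible=i < value) per dropdown, in the same order as the outputs list. Stripped to its essentials, a standalone sketch in the same Gradio 3 style this file uses:

```python
import gradio as gr

with gr.Blocks() as demo_sketch:
    count = gr.Slider(1, 3, value=1, step=1, label="Speakers")
    boxes = [gr.Textbox(label=f"Voice {i + 1}", visible=(i == 0)) for i in range(3)]

    def toggle(value):
        # One update per component, ordered exactly like `boxes`.
        return [gr.update(visible=i < value) for i in range(3)]

    count.change(toggle, count, boxes)

# demo_sketch.launch()
```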
+
+             with gr.Column():
+                 with gr.Accordion("Advanced Settings", open=False):
+
+                     AUDIO_MIX = gr.Dropdown(['Mixing audio with sidechain compression', 'Adjusting volumes and mixing audio'], value='Adjusting volumes and mixing audio', label='Audio Mixing Method', info="Mix original and translated audio files to create a customized, balanced output with two available mixing modes.")
+
+                     gr.HTML("<hr>")
+                     gr.Markdown("Default configuration of Whisper.")
+                     WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
+                     batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
+                     compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
+
+                     gr.HTML("<hr>")
+                     VIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name", value="video_output.mp4", info="The name of the output file")
+                     PREVIEW = gr.Checkbox(label="Preview", info="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
+
+             with gr.Column(variant='compact'):
                  with gr.Row():
                      video_button = gr.Button("TRANSLATE", )
                  with gr.Row():
                      video_output = gr.Video()
 
+                 line_ = gr.HTML("<hr>")
+                 if os.getenv("YOUR_HF_TOKEN") is None or os.getenv("YOUR_HF_TOKEN") == "":
+                     HFKEY = gr.Textbox(visible=True, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
+                 else:
+                     HFKEY = gr.Textbox(visible=False, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
 
          gr.Examples(
              examples=[
                  [
+                     "./assets/Video_main.mp4",
+                     "",
+                     True,
                      "base",
                      16,
                      "float32",
+                     "Spanish (es)",
+                     "English (en)",
                      1,
                      2,
                      'en-AU-WilliamNeural-Male',
  ...
                      'en-GB-SoniaNeural-Female',
                      'en-NZ-MitchellNeural-Male',
                      'en-GB-MaisieNeural-Female',
+                     "video_output.mp4",
+                     'Adjusting volumes and mixing audio',
                  ],
              ],
              fn=translate_from_video,
              inputs=[
                  video_input,
+                 HFKEY,
+                 PREVIEW,
+                 WHISPER_MODEL_SIZE,
                  batch_size,
+                 compute_type,
+                 SOURCE_LANGUAGE,
+                 TRANSLATE_AUDIO_TO,
                  min_speakers,
                  max_speakers,
                  tts_voice00,
  ...
                  tts_voice03,
                  tts_voice04,
                  tts_voice05,
+                 VIDEO_OUTPUT_NAME,
+                 AUDIO_MIX,
              ],
              outputs=[video_output],
+             cache_examples=False,
          )
 
+     ### link
 
      with gr.Tab("Translate audio from video link"):
          with gr.Row():
              with gr.Column():
+
+                 blink_input = gr.Textbox(label="Media link.", info="Example: www.youtube.com/watch?v=g_9rPvbENUw", placeholder="URL goes here...")
+                 # bSOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], value='en', label='Source language')
+
+                 # gr.HTML("<hr></h2>")
+
+                 # bHFKEY = gr.Textbox(label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
+
+                 # gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
+                 # bTRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en', label='Translate audio to')
+
+                 # gr.Markdown("Select how many people are speaking in the video.")
+                 # bmin_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1, )
+                 # bmax_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers", step=1)
+
+                 # gr.Markdown("Select the voice you want for each speaker.")
+                 # btts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label='TTS Speaker 1')
+                 # btts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label='TTS Speaker 2')
+                 # btts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label='TTS Speaker 3')
+                 # btts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label='TTS Speaker 4')
+                 # btts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label='TTS Speaker 5')
+                 # btts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label='TTS Speaker 6')
+
+                 # with gr.Column():
+                 #     with gr.Accordion("Advanced Settings", open=False):
+                 #         gr.Markdown("Default configuration of Whisper.")
+                 #         bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
+                 #         bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
+                 #         bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
+
+                 #         bPREVIEW = gr.inputs.Checkbox(label="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
+                 #         bVIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name", value="video_output.mp4")
+
+                 bSOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'English (en)', 'French (fr)', 'German (de)', 'Spanish (es)', 'Italian (it)', 'Japanese (ja)', 'Chinese (zh)', 'Dutch (nl)', 'Ukrainian (uk)', 'Portuguese (pt)'], value='Automatic detection', label='Source language', info="This is the original language of the video")
+                 bTRANSLATE_AUDIO_TO = gr.Dropdown(['English (en)', 'French (fr)', 'German (de)', 'Spanish (es)', 'Italian (it)', 'Japanese (ja)', 'Chinese (zh)', 'Dutch (nl)', 'Ukrainian (uk)', 'Portuguese (pt)'], value='English (en)', label='Translate audio to', info="Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
+
+                 bline_ = gr.HTML("<hr>")
                  gr.Markdown("Select how many people are speaking in the video.")
+                 bmin_speakers = gr.Slider(1, MAX_TTS, value=1, label="min_speakers", step=1, visible=False)
+                 bmax_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interactive=True)
 
                  gr.Markdown("Select the voice you want for each speaker.")
+                 def bsubmit(value):
+                     visibility_dict = {
+                         f'btts_voice{i:02d}': gr.update(visible=i < value) for i in range(6)
+                     }
+                     return [value for value in visibility_dict.values()]
+                 btts_voice00 = gr.Dropdown(list_tts, value='en-AU-WilliamNeural-Male', label='TTS Speaker 1', visible=True, interactive=True)
+                 btts_voice01 = gr.Dropdown(list_tts, value='en-CA-ClaraNeural-Female', label='TTS Speaker 2', visible=True, interactive=True)
+                 btts_voice02 = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label='TTS Speaker 3', visible=False, interactive=True)
+                 btts_voice03 = gr.Dropdown(list_tts, value='en-GB-SoniaNeural-Female', label='TTS Speaker 4', visible=False, interactive=True)
+                 btts_voice04 = gr.Dropdown(list_tts, value='en-NZ-MitchellNeural-Male', label='TTS Speaker 5', visible=False, interactive=True)
+                 btts_voice05 = gr.Dropdown(list_tts, value='en-GB-MaisieNeural-Female', label='TTS Speaker 6', visible=False, interactive=True)
+                 bmax_speakers.change(bsubmit, bmax_speakers, [btts_voice00, btts_voice01, btts_voice02, btts_voice03, btts_voice04, btts_voice05])
+
+             with gr.Column():
+                 with gr.Accordion("Advanced Settings", open=False):
+
+                     bAUDIO_MIX = gr.Dropdown(['Mixing audio with sidechain compression', 'Adjusting volumes and mixing audio'], value='Adjusting volumes and mixing audio', label='Audio Mixing Method', info="Mix original and translated audio files to create a customized, balanced output with two available mixing modes.")
+
+                     gr.HTML("<hr>")
+                     gr.Markdown("Default configuration of Whisper.")
+                     bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
+                     bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
+                     bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
+
+                     gr.HTML("<hr>")
+                     bVIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name", value="video_output.mp4", info="The name of the output file")
+                     bPREVIEW = gr.Checkbox(label="Preview", info="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
+
                  # text_button = gr.Button("Translate audio of video")
                  # link_output = gr.Video() #gr.outputs.File(label="Download!")
 
+             with gr.Column(variant='compact'):
                  with gr.Row():
                      text_button = gr.Button("TRANSLATE")
                  with gr.Row():
+                     blink_output = gr.Video() #gr.outputs.File(label="Download!") # gr.Video()
+
+                 bline_ = gr.HTML("<hr>")
+                 if os.getenv("YOUR_HF_TOKEN") is None or os.getenv("YOUR_HF_TOKEN") == "":
+                     bHFKEY = gr.Textbox(visible=True, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
+                 else:
+                     bHFKEY = gr.Textbox(visible=False, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
 
          gr.Examples(
              examples=[
                  [
                      "https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
+                     "",
+                     True,
                      "base",
                      16,
                      "float32",
+                     "Japanese (ja)",
+                     "English (en)",
                      1,
                      2,
                      'en-CA-ClaraNeural-Female',
  ...
                      'en-GB-SoniaNeural-Female',
                      'en-NZ-MitchellNeural-Male',
                      'en-GB-MaisieNeural-Female',
+                     "video_output.mp4",
+                     'Adjusting volumes and mixing audio',
                  ],
              ],
              fn=translate_from_video,
              inputs=[
+                 blink_input,
+                 bHFKEY,
+                 bPREVIEW,
+                 bWHISPER_MODEL_SIZE,
                  bbatch_size,
+                 bcompute_type,
+                 bSOURCE_LANGUAGE,
+                 bTRANSLATE_AUDIO_TO,
                  bmin_speakers,
                  bmax_speakers,
                  btts_voice00,
  ...
                  btts_voice03,
                  btts_voice04,
                  btts_voice05,
+                 bVIDEO_OUTPUT_NAME,
+                 bAUDIO_MIX
              ],
+             outputs=[blink_output],
+             cache_examples=False,
          )
 
+     with gr.Tab("Help"):
+         gr.Markdown(news)
+         gr.Markdown(tutorial)
+
+     with gr.Accordion("Logs", open=False):
          logs = gr.Textbox()
          demo.load(read_logs, None, logs, every=1)
 
      # run
      video_button.click(translate_from_video, inputs=[
+         video_input,
+         HFKEY,
+         PREVIEW,
+         WHISPER_MODEL_SIZE,
          batch_size,
+         compute_type,
+         SOURCE_LANGUAGE,
+         TRANSLATE_AUDIO_TO,
          min_speakers,
          max_speakers,
          tts_voice00,
  ...
          tts_voice02,
          tts_voice03,
          tts_voice04,
+         tts_voice05,
+         VIDEO_OUTPUT_NAME,
+         AUDIO_MIX,
+     ], outputs=video_output)
      text_button.click(translate_from_video, inputs=[
+         blink_input,
+         bHFKEY,
+         bPREVIEW,
+         bWHISPER_MODEL_SIZE,
          bbatch_size,
+         bcompute_type,
+         bSOURCE_LANGUAGE,
+         bTRANSLATE_AUDIO_TO,
          bmin_speakers,
          bmax_speakers,
          btts_voice00,
  ...
          btts_voice02,
          btts_voice03,
          btts_voice04,
+         btts_voice05,
+         bVIDEO_OUTPUT_NAME,
+         bAUDIO_MIX,
+     ], outputs=blink_output)
 
      demo.launch(enable_queue=True)
+ #demo.launch()