RASMUS committed on
Commit 07903ab
1 Parent(s): 4970504

Update app.py

Files changed (1)
  1. app.py +124 -42
app.py CHANGED
@@ -21,13 +21,25 @@ import torch

# is cuda available?

- from easynmt import EasyNMT
- translation_model = EasyNMT('m2m_100_418M', max_new_tokens=60)

- asr_model = whisper.load_model("base")

transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)

- translation_models = {
"Afrikaans":"af",
"Amharic":"am",
"Arabic":"ar",
@@ -127,15 +139,44 @@ translation_models = {
"Yiddish":"yi",
"Yoruba":"yo",
"Chinese":"zh",
- "Zulu":"zu"
}

- translation_models_list = [key[0] for key in translation_models.items()]

- device = "cpu"#torch.device("cuda" if torch.cuda.is_available() else "cpu")
- print("DEVICE IS: ")
- print(device)

videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)
@@ -148,18 +189,19 @@ def get_youtube(video_url):

return abs_video_path

- async def speech_to_text(video_file_path, selected_translation_lang):
"""
# Youtube with translated subtitles using OpenAI Whisper and Opus-MT models.
# Currently supports only English audio
This space allows you to:
1. Download youtube video with a given url
2. Watch it in the first video component
- 3. Run automatic speech recognition on the video using Whisper
- 4. Translate the recognized transcriptions to Finnish, Swedish, Danish
5. Burn the translations to the original video and watch the video in the 2nd video component

- Speech Recognition is based on OpenAI Whisper https://github.com/openai/whisper
"""

if(video_file_path == None):
@@ -193,20 +235,48 @@ async def speech_to_text(video_file_path, selected_translation_lang):
}
df = df.append(new_row, ignore_index=True)

- if selected_translation_lang is None:
- selected_translation_lang = 'Finnish'
-
- sentences = df['text']
- df['translation'] = translation_model.translate(sentences, target_lang=translation_models.get(selected_translation_lang), max_new_tokens = 50)
-
- print('After translation to target language \n')
-
return (df)
except Exception as e:
raise RuntimeError("Error Running inference with local model", e)


def create_srt_and_burn(df, video_in):

print("Starting creation of video wit srt")
@@ -286,10 +356,12 @@ video_out = gr.Video(label="Video Out", mirror_webcam=False)


df_init = pd.DataFrame(columns=['start','end','text','translation'])
- selected_translation_lang = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="Language to translate transcriptions to", interactive=True)
-
- transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10)


demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
@@ -306,14 +378,14 @@ with demo:
### This space allows you to:
##### 1. Download youtube video with a given URL
##### 2. Watch it in the first video component
- ##### 3. Run automatic speech recognition on the video using Whisper (Please remember to select translation language)
- ##### 4. Translate the recognized transcriptions to Finnish, Swedish, Danish
##### 5. Burn the translations to the original video and watch the video in the 2nd video component
''')

with gr.Column():
gr.Markdown('''
- ### 1. Insert Youtube URL below (Some examples below which I suggest to use for first tests)
##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
@@ -334,20 +406,17 @@ with demo:
with gr.Column():
gr.Markdown('''
##### Here you can start the transcription and translation process.
- ##### Be aware that processing will last for a while (35 second video took around 20 seconds in my testing)
''')
- transcribe_btn = gr.Button("Step 2. Transcribe and translate audio")
-
- transcribe_btn.click(speech_to_text, [video_in, selected_translation_lang], transcription_df)
-
- with gr.Row():
- with gr.Column():
- selected_translation_lang.render()

with gr.Row():
gr.Markdown('''
- ##### Here you will get transcription and translation output
- ##### If you see error please remember to select translation language
##### ''')

with gr.Row():
@@ -356,12 +425,25 @@ with demo:

with gr.Row():
with gr.Column():
- translate_and_make_srt_btn = gr.Button("Step 3. Create and burn srt to video")
print(video_in)
- translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_df,video_in], [
video_out])
video_out.render()


- if __name__ == "__main__":
- demo.queue().launch(debug=True, share=False, enable_queue=True)

@@ -21,13 +21,25 @@ import torch

# is cuda available?

+ num_cores = psutil.cpu_count()
+ os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+ headers = {'Authorization': os.environ['DeepL_API_KEY']}
+
+ device = "cpu"#torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print("DEVICE IS: ")
+ print(device)
+
+ asr_model_base = whisper.load_model("base", device=device)
+ asr_model_small = whisper.load_model("small", device=device)
+ whisper_models = {
+ 'base': asr_model_base,
+ 'small': asr_model_small
+ }
+
transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)

+ source_languages = {
"Afrikaans":"af",
"Amharic":"am",
"Arabic":"ar",
 
@@ -127,15 +139,44 @@ translation_models = {
"Yiddish":"yi",
"Yoruba":"yo",
"Chinese":"zh",
+ "Zulu":"zu",
+ "Let the model analyze": "Let the model analyze"
+ }
+
+ DeepL_language_codes_for_translation = {
+ "Bulgarian": "BG",
+ "Czech": "CS",
+ "Danish": "DA",
+ "German": "DE",
+ "Greek": "EL",
+ "English": "EN",
+ "Spanish": "ES",
+ "Estonian": "ET",
+ "Finnish": "FI",
+ "French": "FR",
+ "Hungarian": "HU",
+ "Indonesian": "ID",
+ "Italian": "IT",
+ "Japanese": "JA",
+ "Lithuanian": "LT",
+ "Latvian": "LV",
+ "Dutch": "NL",
+ "Polish": "PL",
+ "Portuguese": "PT",
+ "Romanian": "RO",
+ "Russian": "RU",
+ "Slovak": "SK",
+ "Slovenian": "SL",
+ "Swedish": "SV",
+ "Turkish": "TR",
+ "Ukrainian": "UK",
+ "Chinese": "ZH"
}

+ source_language_list = [key[0] for key in source_languages.items()]
+ translation_models_list = [key[0] for key in DeepL_language_codes_for_translation.items()]

videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)
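The two list comprehensions in the hunk above collect the dropdown choices; key[0] works because dict.items() yields (key, value) pairs, though list(d.keys()) says the same thing more directly. A tiny equivalence sketch with a truncated example dictionary:

source_languages = {"Afrikaans": "af", "Amharic": "am"}  # truncated example dict

# Both expressions yield ["Afrikaans", "Amharic"].
via_items = [key[0] for key in source_languages.items()]
via_keys = list(source_languages.keys())
assert via_items == via_keys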
 
@@ -148,18 +189,19 @@ def get_youtube(video_url):

return abs_video_path

+ async def speech_to_text(video_file_path, selected_translation_lang, whisper_model):
"""
# Youtube with translated subtitles using OpenAI Whisper and Opus-MT models.
# Currently supports only English audio
This space allows you to:
1. Download youtube video with a given url
2. Watch it in the first video component
+ 3. Run automatic speech recognition on the video using fast Whisper models
+ 4. Translate the recognized transcriptions to 26 languages supported by deepL (If source language not supported this will return original transciption)
5. Burn the translations to the original video and watch the video in the 2nd video component

+ Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+ This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp
"""

if(video_file_path == None):
 
@@ -193,20 +235,48 @@ async def speech_to_text(video_file_path, selected_translation_lang):
}
df = df.append(new_row, ignore_index=True)

return (df)
except Exception as e:
raise RuntimeError("Error Running inference with local model", e)


+
+ def translate_transcriptions(df, selected_translation_lang_2):
+ if selected_translation_lang_2 is None:
+ selected_translation_lang_2 = 'English'
+ df.reset_index(inplace=True)
+
+ print("start_translation")
+ translations = []
+
+ text_combined = ""
+ for i, sentence in enumerate(df['text']):
+ if i == 0:
+ text_combined = sentence
+ else:
+ text_combined = text_combined + '\n' + sentence
+
+ data = {'text': text_combined,
+ 'tag_spitting': 'xml',
+ 'target_lang': DeepL_language_codes_for_translation.get(selected_translation_lang_2)
+ }
+ try:
+ response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
+
+ # Print the response from the server
+ translated_sentences = json.loads(response.text)
+ translated_sentences = translated_sentences['translations'][0]['text'].split('\n')
+ df['translation'] = translated_sentences
+ except Exception as e:
+ print(e)
+ df['translation'] = df['text']
+
+ print("translations done")
+
+ return df
+
def create_srt_and_burn(df, video_in):

print("Starting creation of video wit srt")
 
@@ -286,10 +356,12 @@ video_out = gr.Video(label="Video Out", mirror_webcam=False)


df_init = pd.DataFrame(columns=['start','end','text','translation'])
+ selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True)
+ selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True)
+ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)

+ transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
+ transcription_and_translation_df = gr.DataFrame(value=df_init,label="Transcription and translation dataframe", max_rows = 10, wrap=True, overflow_row_behaviour='paginate')

demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
 
@@ -306,14 +378,14 @@ with demo:
### This space allows you to:
##### 1. Download youtube video with a given URL
##### 2. Watch it in the first video component
+ ##### 3. Run automatic speech recognition on the video using Whisper
+ ##### 4. Translate the recognized transcriptions to 26 languages supported by deepL
##### 5. Burn the translations to the original video and watch the video in the 2nd video component
''')

with gr.Column():
gr.Markdown('''
+ ### 1. Insert Youtube URL below. Some test videos below:
##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
 
@@ -334,20 +406,17 @@ with demo:
with gr.Column():
gr.Markdown('''
##### Here you can start the transcription and translation process.
+ ##### Be aware that processing will last some time. With base model it is around 3x speed
''')
+ selected_source_lang.render()
+ selected_whisper_model.render()
+ transcribe_btn = gr.Button("Step 2. Transcribe audio")
+ transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], transcription_df)
+

with gr.Row():
gr.Markdown('''
+ ##### Here you will get transcription output
##### ''')

with gr.Row():
 
@@ -356,12 +425,25 @@ with demo:

with gr.Row():
with gr.Column():
+ gr.Markdown('''
+ ##### Here you will get translated transcriptions.
+ ##### Please remember to select target language
+ ##### ''')
+ selected_translation_lang_2.render()
+ translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
+ translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2], transcription_and_translation_df)
+ transcription_and_translation_df.render()
+
+ with gr.Row():
+ with gr.Column():
+ gr.Markdown('''
+ ##### Now press the Step 4. Button to create output video with translated transcriptions
+ ##### ''')
+ translate_and_make_srt_btn = gr.Button("Step 4. Create and burn srt to video")
print(video_in)
+ translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_and_translation_df,video_in], [
video_out])
video_out.render()


+ demo.launch()
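The Blocks layout above chains three buttons through shared DataFrame components: Step 2 writes into transcription_df, Step 3 reads it and writes transcription_and_translation_df, and Step 4 consumes that to render the output video. A stripped-down sketch of the same click-wiring pattern, with toy stand-in functions rather than the app's real speech_to_text, translate_transcriptions and create_srt_and_burn:

import gradio as gr
import pandas as pd

def step_transcribe(url):
    # Toy stand-in: a real app would run ASR on the downloaded video.
    return pd.DataFrame({"start": [0.0], "end": [1.0], "text": [f"transcript of {url}"]})

def step_translate(df, lang):
    # Toy stand-in: a real app would call a translation API.
    df = df.copy()
    df["translation"] = df["text"] + f" ({lang})"
    return df

with gr.Blocks() as demo:
    url_in = gr.Textbox(label="Video URL")
    lang_in = gr.Dropdown(choices=["Finnish", "Swedish"], value="Finnish", label="Target language")
    transcript_df = gr.DataFrame()
    translated_df = gr.DataFrame()

    transcribe_btn = gr.Button("Step 2. Transcribe audio")
    translate_btn = gr.Button("Step 3. Translate transcription")

    # The output component of one step doubles as an input to the next step.
    transcribe_btn.click(step_transcribe, [url_in], transcript_df)
    translate_btn.click(step_translate, [transcript_df, lang_in], translated_df)

if __name__ == "__main__":
    demo.launch()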