RASMUS committed on
Commit
3e2d289
1 Parent(s): 178aea2

Create app.py

Files changed (1)
  1. app.py +378 -0
app.py ADDED
@@ -0,0 +1,378 @@
+ import os
+
+ # Fetch and build whisper.cpp, then download the ggml models offered in the UI
+ os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
+ os.system('make -C ./whisper.cpp')
+ os.system('bash ./whisper.cpp/models/download-ggml-model.sh small')
+ os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
+ os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium')
+ os.system('bash ./whisper.cpp/models/download-ggml-model.sh base.en')
+
+ # os.system('./whisper.cpp/main -m whisper.cpp/models/ggml-base.en.bin -f whisper.cpp/samples/jfk.wav')
+ # print("NEXT: SMALL MODEL TEST")
+ # os.system('./whisper.cpp/main -m whisper.cpp/models/ggml-small.bin -f whisper.cpp/samples/jfk.wav')
+ # print("HI")
+
+
+
+ import gradio as gr
+ from pathlib import Path
+ import pysrt
+ import pandas as pd
+ import re
+ import time
+
+ from pytube import YouTube
+ from transformers import MarianMTModel, MarianTokenizer
+
+ import psutil
+ num_cores = psutil.cpu_count()
+ os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+
+
+ import torch
+
+
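+ # Load Helsinki-NLP Opus-MT tokenizers and models for English -> Finnish/Swedish/Danish translation.
+ # Note: setting max_new_tokens on a MarianTokenizer below appears to have no effect on generation;
+ # output length is controlled by the MarianMTModel.generate() call itself.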
+ finnish_marian_nmt_model = "Helsinki-NLP/opus-mt-tc-big-en-fi"
+ finnish_tokenizer_marian = MarianTokenizer.from_pretrained(finnish_marian_nmt_model, max_length=40)
+ finnish_tokenizer_marian.max_new_tokens = 30
+ finnish_translation_model = MarianMTModel.from_pretrained(finnish_marian_nmt_model)
+
+ swedish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-sv"
+ swedish_tokenizer_marian = MarianTokenizer.from_pretrained(swedish_marian_nmt_model, max_length=40)
+ swedish_tokenizer_marian.max_new_tokens = 30
+ swedish_translation_model = MarianMTModel.from_pretrained(swedish_marian_nmt_model)
+
+ danish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-da"
+ danish_tokenizer_marian = MarianTokenizer.from_pretrained(danish_marian_nmt_model, max_length=40)
+ danish_tokenizer_marian.max_new_tokens = 30
+ danish_translation_model = MarianMTModel.from_pretrained(danish_marian_nmt_model)
+
+
+ translation_models = {
+     "Finnish": [finnish_tokenizer_marian, finnish_translation_model],
+     "Swedish": [swedish_tokenizer_marian, swedish_translation_model],
+     "Danish": [danish_tokenizer_marian, danish_translation_model]
+ }
+
+ whisper_models = ["base", "small", "medium", "base.en"]
+
+
+ source_languages = {
+     "Arabic": "ar",
+     "Asturian": "st",
+     "Belarusian": "be",
+     "Bulgarian": "bg",
+     "Czech": "cs",
+     "Danish": "da",
+     "German": "de",
+     "Greek": "el",
+     "English": "en",
+     "Estonian": "et",
+     "Finnish": "fi",
+     "Swedish": "sv",
+     "Spanish": "es",
+     "Let the model analyze": "Let the model analyze"
+ }
+
+ source_languages_2 = {
+     "English": "en",
+ }
+
+
+
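+ # Note: transcribe_options below is defined but not currently passed to the whisper.cpp
+ # command in speech_to_text().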
+ transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
+
+
+ source_language_list = list(source_languages.keys())
+ source_language_list_2 = list(source_languages_2.keys())
+ translation_models_list = list(translation_models.keys())
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print("DEVICE IS:")
+ print(device)
+
+ videos_out_path = Path("./videos_out")
+ videos_out_path.mkdir(parents=True, exist_ok=True)
+
+
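+ # ---- Pipeline helper functions (download, transcribe, translate, burn) ----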
+ def get_youtube(video_url):
+     # Download the highest-resolution progressive mp4 stream of the given YouTube video
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+     print("DOWNLOADED TO PATH:")
+     print(abs_video_path)
+
+     return abs_video_path
+
+ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
+     """
+     # YouTube videos with translated subtitles using OpenAI Whisper and Opus-MT models.
+     # Currently supports only English audio
+     This space allows you to:
+     1. Download a YouTube video from a given URL
+     2. Watch it in the first video component
+     3. Run automatic speech recognition on the video using Whisper
+     4. Translate the recognized transcriptions to Finnish, Swedish or Danish
+     5. Burn the translations into the original video and watch the video in the second video component
+
+     Speech recognition is based on OpenAI Whisper https://github.com/openai/whisper
+     """
+
+     if video_file_path is None:
+         raise ValueError("Error: no video input")
+     print(video_file_path)
+     try:
+         _, file_ending = os.path.splitext(f'{video_file_path}')
+         print(f'file ending is {file_ending}')
+         print("starting conversion to wav")
+         # whisper.cpp expects 16 kHz mono 16-bit PCM WAV input
+         os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{video_file_path.replace(file_ending, ".wav")}"')
+         print("conversion to wav ready")
+
+         print("starting whisper c++")
+         srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
+         os.system(f'rm -f "{srt_path}"')
+         # "-osrt" makes whisper.cpp write an SRT file next to the input WAV
+         if selected_source_lang == "Let the model analyze":
+             os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
+         else:
+             os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
+         print("whisper.cpp transcription done")
+     except Exception as e:
+         raise RuntimeError("Error converting video to audio")
+
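+     # whisper.cpp was run with -osrt, so the transcript was written to "<wav path>.srt";
+     # parse it into a DataFrame of (start, end, text) rows for the Gradio table.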
+     try:
+         df = pd.DataFrame(columns=['start', 'end', 'text'])
+         srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
+         subs = pysrt.open(srt_path)
+
+         objects = []
+         for sub in subs:
+             # Zero-pad every component; SRT timestamps have the form HH:MM:SS,mmm
+             start_time = f"{sub.start.hours:02d}:{sub.start.minutes:02d}:{sub.start.seconds:02d},{sub.start.milliseconds:03d}"
+             end_time = f"{sub.end.hours:02d}:{sub.end.minutes:02d}:{sub.end.seconds:02d},{sub.end.milliseconds:03d}"
+             objects.append([sub.text, start_time, end_time])
+
+         for obj in objects:
+             srt_to_df = {
+                 'start': [obj[1]],
+                 'end': [obj[2]],
+                 'text': [obj[0]]
+             }
+             df = pd.concat([df, pd.DataFrame(srt_to_df)])
+
+         return df
+
+     except Exception as e:
+         raise RuntimeError("Error running inference with local model", e)
+
+
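+ # Translate the transcribed rows with the selected Opus-MT model. All sentences are passed to
+ # generate() as a single batch, so very long transcripts may be slow or memory-hungry on CPU.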
+ def translate_transcriptions(df, selected_translation_lang_2, selected_source_lang_2):
+     print("IN TRANSLATE")
+
+     if selected_translation_lang_2 is None:
+         selected_translation_lang_2 = 'Finnish'
+     df.reset_index(inplace=True)
+
+     print("start translation")
+     print(df.head())
+     if selected_translation_lang_2 != selected_source_lang_2:
+         print("TRANSLATING")
+         print("Getting models")
+         # Look up the tokenizer/model pair only when a translation is actually needed
+         tokenizer_marian = translation_models.get(selected_translation_lang_2)[0]
+         translation_model = translation_models.get(selected_translation_lang_2)[1]
+         sentences = list(df['text'])
+         sentences = [stringi.replace('[', '').replace(']', '') for stringi in sentences]
+         translated_tokens = translation_model.generate(**tokenizer_marian(sentences, return_tensors="pt", padding=True, truncation=True))
+         # Decode the generated token ids back into plain text
+         translations = tokenizer_marian.batch_decode(translated_tokens, skip_special_tokens=True)
+         print(translations)
+         df['translation'] = translations
+     else:
+         df['translation'] = df['text']
+     print("translations done")
+
+     return df
+
+
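+ # Write the translated rows to testi.srt and hard-burn them into the video with ffmpeg's
+ # "subtitles" filter, producing "<input>_out.mp4".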
+ def create_srt_and_burn(df, video_in):
+
+     print("Starting creation of video with srt")
+     print("video in path is:")
+     print(video_in)
+
+     # Write the dataframe rows out as a standard SRT file
+     with open('testi.srt', 'w', encoding="utf-8") as file:
+         for i in range(len(df)):
+             file.write(str(i + 1))
+             file.write('\n')
+             start = df.iloc[i]['start']
+             file.write(f"{start}")
+             stop = df.iloc[i]['end']
+             file.write(' --> ')
+             file.write(f"{stop}")
+             file.write('\n')
+             file.writelines(df.iloc[i]['translation'])
+             if int(i) != len(df) - 1:
+                 file.write('\n\n')
+
+     print("SRT DONE")
+     video_out = video_in.replace('.mp4', '_out.mp4')
+     try:
+         with open('./testi.srt', 'r', encoding="utf-8") as file1:
+             Lines = file1.readlines()
+
+         count = 0
+         # Print the generated SRT for debugging
+         for line in Lines:
+             count += 1
+             print("{}".format(line))
+
+         print(type(video_in))
+         print(video_in)
+
+         print("video_out_path")
+         print(video_out)
+         command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out)
+         print(command)
+         os.system(command)
+         return video_out
+     except Exception as e:
+         print(e)
+         return video_out
+
+
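+ # The Blocks UI below wires the four steps together: download -> transcribe -> translate -> burn.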
+ # ---- Gradio Layout -----
+ video_in = gr.Video(label="Video file", mirror_webcam=False)
+ youtube_url_in = gr.Textbox(label="YouTube url", lines=1, interactive=True)
+ video_out = gr.Video(label="Video Out", mirror_webcam=False)
+
+
+ df_init = pd.DataFrame(columns=['start', 'end', 'text'])
+ df_init_2 = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])
+ selected_translation_lang = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language do you want the transcriptions?", interactive=True)
+
+ selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True)
+ selected_source_lang_2 = gr.Dropdown(choices=source_language_list_2, type="value", value="English", label="Spoken language in video", interactive=True)
+ selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language do you want the transcriptions?", interactive=True)
+ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
+
+ transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+ transcription_and_translation_df = gr.DataFrame(value=df_init_2, label="Transcription and translation dataframe", max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+
+
+ demo = gr.Blocks(css='''
+ #cut_btn, #reset_btn { align-self:stretch; }
+ #\\31 3 { max-width: 540px; }
+ .output-markdown {max-width: 65ch !important;}
+ ''')
+ demo.encrypt = False
+ with demo:
+     transcription_var = gr.Variable()
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown('''
+             ### This space allows you to:
+             ##### 1. Download a YouTube video from a given URL
+             ##### 2. Watch it in the first video component
+             ##### 3. Run automatic speech recognition on the video using Whisper (please remember to select the translation language)
+             ##### 4. Translate the recognized transcriptions to Finnish, Swedish or Danish
+             ##### 5. Burn the translations into the original video and watch it in the second video component
+             ''')
+
+         with gr.Column():
+             gr.Markdown('''
+             ### 1. Insert a YouTube URL below (some example links I suggest using for first tests)
+             ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
+             ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
+             ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
+             ''')
+
+     with gr.Row():
+         with gr.Column():
+             youtube_url_in.render()
+             download_youtube_btn = gr.Button("Step 1. Download YouTube video")
+             download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+             print(video_in)
+
+
+     with gr.Row():
+         with gr.Column():
+             video_in.render()
+         with gr.Column():
+             gr.Markdown('''
+             ##### Here you can start the transcription and translation process.
+             ##### Be aware that processing takes a while (a 35-second video took around 20 seconds in my testing, and longer videos might fail).
+             ''')
+             selected_source_lang.render()
+             selected_whisper_model.render()
+             transcribe_btn = gr.Button("Step 2. Transcribe audio")
+             transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], transcription_df)
+
+
+     with gr.Row():
+         gr.Markdown('''
+         ##### Here you will get the transcription output
+         ##### ''')
+
+     with gr.Row():
+         with gr.Column():
+             transcription_df.render()
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown('''
+             ##### Here you will get the translated transcriptions.
+             ##### Please remember to select the spoken language and the desired translation language
+             ##### ''')
+             selected_source_lang_2.render()
+             selected_translation_lang_2.render()
+             translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
+             translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2, selected_source_lang_2], transcription_and_translation_df)
+             transcription_and_translation_df.render()
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown('''
+             ##### Now press the Step 4 button to create the output video with translated transcriptions
+             ##### ''')
+             translate_and_make_srt_btn = gr.Button("Step 4. Create and burn srt to video")
+             print(video_in)
+             translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_and_translation_df, video_in], [video_out])
+             video_out.render()
+
+
+ demo.launch()