RASMUS committed
Commit 7ed286f
1 Parent(s): 797b8a0

Create app.py

Files changed (1)
  1. app.py +367 -0
app.py ADDED
@@ -0,0 +1,367 @@
+ import gradio as gr
+ import os
+ import time
+ from pathlib import Path
+
+ import pandas as pd
+
+ import whisper
+ from pytube import YouTube
+
+ import psutil
+
+ # Give OpenMP/BLAS all available cores for CPU inference
+ num_cores = psutil.cpu_count()
+ os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+
+ import torch
+
+ from easynmt import EasyNMT
+ translation_model = EasyNMT('m2m_100_418M', max_new_tokens=60)
+
+ asr_model = whisper.load_model("base")
+ transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
+
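+ # Whisper "base" is the small multilingual checkpoint; larger checkpoints
+ # ("small", "medium") trade speed for accuracy. Note that speech_to_text()
+ # below overrides these transcribe_options with its own beam settings.
+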
+ translation_models = {
+     "Afrikaans": "af",
+     "Amharic": "am",
+     "Arabic": "ar",
+     "Asturian": "ast",
+     "Azerbaijani": "az",
+     "Bashkir": "ba",
+     "Belarusian": "be",
+     "Bulgarian": "bg",
+     "Bengali": "bn",
+     "Breton": "br",
+     "Bosnian": "bs",
+     "Catalan; Valencian": "ca",
+     "Cebuano": "ceb",
+     "Czech": "cs",
+     "Welsh": "cy",
+     "Danish": "da",
+     "German": "de",
+     "Greek": "el",
+     "English": "en",
+     "Spanish": "es",
+     "Estonian": "et",
+     "Persian": "fa",
+     "Fulah": "ff",
+     "Finnish": "fi",
+     "French": "fr",
+     "Western Frisian": "fy",
+     "Irish": "ga",
+     "Gaelic; Scottish Gaelic": "gd",
+     "Galician": "gl",
+     "Gujarati": "gu",
+     "Hausa": "ha",
+     "Hebrew": "he",
+     "Hindi": "hi",
+     "Croatian": "hr",
+     "Haitian; Haitian Creole": "ht",
+     "Hungarian": "hu",
+     "Armenian": "hy",
+     "Indonesian": "id",
+     "Igbo": "ig",
+     "Iloko": "ilo",
+     "Icelandic": "is",
+     "Italian": "it",
+     "Japanese": "ja",
+     "Javanese": "jv",
+     "Georgian": "ka",
+     "Kazakh": "kk",
+     "Central Khmer": "km",
+     "Kannada": "kn",
+     "Korean": "ko",
+     "Luxembourgish; Letzeburgesch": "lb",
+     "Ganda": "lg",
+     "Lingala": "ln",
+     "Lao": "lo",
+     "Lithuanian": "lt",
+     "Latvian": "lv",
+     "Malagasy": "mg",
+     "Macedonian": "mk",
+     "Malayalam": "ml",
+     "Mongolian": "mn",
+     "Marathi": "mr",
+     "Malay": "ms",
+     "Burmese": "my",
+     "Nepali": "ne",
+     "Dutch; Flemish": "nl",
+     "Norwegian": "no",
+     "Northern Sotho": "ns",
+     "Occitan (post 1500)": "oc",
+     "Oriya": "or",
+     "Panjabi; Punjabi": "pa",
+     "Polish": "pl",
+     "Pushto; Pashto": "ps",
+     "Portuguese": "pt",
+     "Romanian; Moldavian; Moldovan": "ro",
+     "Russian": "ru",
+     "Sindhi": "sd",
+     "Sinhala; Sinhalese": "si",
+     "Slovak": "sk",
+     "Slovenian": "sl",
+     "Somali": "so",
+     "Albanian": "sq",
+     "Serbian": "sr",
+     "Swati": "ss",
+     "Sundanese": "su",
+     "Swedish": "sv",
+     "Swahili": "sw",
+     "Tamil": "ta",
+     "Thai": "th",
+     "Tagalog": "tl",
+     "Tswana": "tn",
+     "Turkish": "tr",
+     "Ukrainian": "uk",
+     "Urdu": "ur",
+     "Uzbek": "uz",
+     "Vietnamese": "vi",
+     "Wolof": "wo",
+     "Xhosa": "xh",
+     "Yiddish": "yi",
+     "Yoruba": "yo",
+     "Chinese": "zh",
+     "Zulu": "zu"
+ }
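+ # The values are the language codes expected by the M2M-100 checkpoint wrapped
+ # by EasyNMT (mostly ISO 639-1, plus a few longer codes such as "ast", "ceb", "ilo").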
+
+ translation_models_list = list(translation_models.keys())
+
+ # Is CUDA available? Forced to CPU here for the hosted space.
+ device = "cpu"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"Device is: {device}")
+
+ videos_out_path = Path("./videos_out")
+ videos_out_path.mkdir(parents=True, exist_ok=True)
+
+ def get_youtube(video_url):
+     yt = YouTube(video_url)
+     # Pick the highest-resolution progressive (audio+video) MP4 stream
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+     print("Downloaded to path:")
+     print(abs_video_path)
+
+     return abs_video_path
+
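+ # Example, using the first sample URL from the UI below:
+ #   path = get_youtube("https://www.youtube.com/watch?v=nlMuHtV82q8")
+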
+ async def speech_to_text(video_file_path, selected_translation_lang):
+     """
+     YouTube videos with translated subtitles using OpenAI Whisper and M2M-100 models.
+     Currently supports only English audio.
+
+     This space allows you to:
+     1. Download a YouTube video from a given URL
+     2. Watch it in the first video component
+     3. Run automatic speech recognition on the video using Whisper
+     4. Translate the recognized transcriptions to the selected target language
+     5. Burn the translations into the original video and watch it in the second video component
+
+     Speech recognition is based on OpenAI Whisper, https://github.com/openai/whisper
+     """
+
+     if video_file_path is None:
+         raise ValueError("Error: no video input")
+     print(video_file_path)
+     try:
+         audio = whisper.load_audio(video_file_path)
+     except Exception as e:
+         raise RuntimeError("Error converting video to audio") from e
+
+     last_time = time.time()
+
+     try:
+         print('Transcribing via local model')
+         transcribe_options = dict(beam_size=5, best_of=5, without_timestamps=False)
+
+         transcription = asr_model.transcribe(audio, **transcribe_options)
+
+         # Collect Whisper segments into rows and build the DataFrame once
+         # (DataFrame.append is deprecated and removed in pandas 2.x)
+         rows = [{'start': segment['start'],
+                  'end': segment['end'],
+                  'text': segment['text']}
+                 for segment in transcription['segments']]
+         df = pd.DataFrame(rows, columns=['start', 'end', 'text'])
+
+         if selected_translation_lang is None:
+             selected_translation_lang = 'Finnish'
+
+         sentences = df['text'].tolist()
+         df['translation'] = translation_model.translate(sentences, target_lang=translation_models.get(selected_translation_lang), max_new_tokens=50)
+
+         print('After translation to target language\n')
+
+         return df
+     except Exception as e:
+         raise RuntimeError("Error running inference with local model") from e
+
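+ # The returned DataFrame has one row per Whisper segment, with columns
+ # ['start', 'end', 'text', 'translation']; create_srt_and_burn() below
+ # turns it into an SRT file and burns it into the video.
+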
+ def format_timestamp(total_seconds):
+     """Convert seconds to an SRT timestamp, HH:MM:SS,mmm (SRT uses a comma before the milliseconds)."""
+     milliseconds = round(total_seconds * 1000.0)
+
+     hours = milliseconds // 3_600_000
+     milliseconds -= hours * 3_600_000
+
+     minutes = milliseconds // 60_000
+     milliseconds -= minutes * 60_000
+
+     seconds = milliseconds // 1_000
+     milliseconds -= seconds * 1_000
+
+     return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+
+
+ def create_srt_and_burn(df, video_in):
+
+     print("Starting creation of video with srt")
+
+     with open('testi.srt', 'w', encoding="utf-8") as file:
+         for i in range(len(df)):
+             file.write(str(i + 1))
+             file.write('\n')
+             file.write(format_timestamp(df.iloc[i]['start']))
+             file.write(' --> ')
+             file.write(format_timestamp(df.iloc[i]['end']))
+             file.write('\n')
+             file.write(df.iloc[i]['translation'])
+             if i != len(df) - 1:
+                 file.write('\n\n')
+
+     print("SRT DONE")
+     video_out = video_in.replace('.mp4', '_out.mp4')
+     try:
+         # Echo the generated subtitle file for debugging
+         with open('./testi.srt', 'r', encoding="utf-8") as file1:
+             for line in file1.readlines():
+                 print("{}".format(line))
+
+         print(type(video_in))
+         print(video_in)
+         print(video_out)
+
+         command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out)
+         print(command)
+         os.system(command)
+         return video_out
+     except Exception as e:
+         print(e)
+         return video_out
+
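+ # Note: ffmpeg's subtitles filter re-encodes the video to burn in the SRT, so
+ # Step 3 writes a new "<name>_out.mp4" file next to the input video.
+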
+ # ---- Gradio Layout -----
+ video_in = gr.Video(label="Video file", mirror_webcam=False)
+ youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+ video_out = gr.Video(label="Video Out", mirror_webcam=False)
+
+ df_init = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])
+ selected_translation_lang = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="Language to translate transcriptions to", interactive=True)
+
+ transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10)
+
+ demo = gr.Blocks(css='''
+ #cut_btn, #reset_btn { align-self:stretch; }
+ #\\31 3 { max-width: 540px; }
+ .output-markdown {max-width: 65ch !important;}
+ ''')
+ demo.encrypt = False
+ with demo:
+     transcription_var = gr.Variable()
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown('''
+             ### This space allows you to:
+             ##### 1. Download a YouTube video from a given URL
+             ##### 2. Watch it in the first video component
+             ##### 3. Run automatic speech recognition on the video using Whisper (please remember to select a translation language)
+             ##### 4. Translate the recognized transcriptions to the selected target language
+             ##### 5. Burn the translations into the original video and watch the result in the second video component
+             ''')
+
+         with gr.Column():
+             gr.Markdown('''
+             ### 1. Insert a YouTube URL below (some examples that I suggest using for first tests)
+             ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
+             ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
+             ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
+             ''')
+
+     with gr.Row():
+         with gr.Column():
+             youtube_url_in.render()
+             download_youtube_btn = gr.Button("Step 1. Download Youtube video")
+             download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+
+     with gr.Row():
+         with gr.Column():
+             video_in.render()
+         with gr.Column():
+             gr.Markdown('''
+             ##### Here you can start the transcription and translation process.
+             ##### Be aware that processing will take a while (a 35-second video took around 20 seconds in my testing).
+             ''')
+             transcribe_btn = gr.Button("Step 2. Transcribe and translate audio")
+             transcribe_btn.click(speech_to_text, [video_in, selected_translation_lang], transcription_df)
+
+     with gr.Row():
+         with gr.Column():
+             selected_translation_lang.render()
+
+     with gr.Row():
+         gr.Markdown('''
+         ##### Here you will get the transcription and translation output.
+         ##### If you see an error, please remember to select a translation language.
+         ''')
+
+     with gr.Row():
+         with gr.Column():
+             transcription_df.render()
+
+     with gr.Row():
+         with gr.Column():
+             translate_and_make_srt_btn = gr.Button("Step 3. Create and burn srt to video")
+             translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_df, video_in], [video_out])
+             video_out.render()
+
+
+ if __name__ == "__main__":
+     demo.queue().launch(debug=True, share=False)
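+
+ # To run locally (assumed environment): install gradio, pytube, easynmt, psutil,
+ # pandas and openai-whisper, make sure ffmpeg is on PATH, then start with
+ # `python app.py`; Gradio serves the UI on http://127.0.0.1:7860 by default.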