Karwasze RASMUS committed on
Commit
48b9dd3
0 Parent(s):

Duplicate from Finnish-NLP/Whisper-ASR-youtube-subtitles


Co-authored-by: TOIVANEN <RASMUS@users.noreply.huggingface.co>

Files changed (5)
  1. .gitattributes +31 -0
  2. README.md +14 -0
  3. app.py +271 -0
  4. packages.txt +1 -0
  5. requirements.txt +16 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
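
These patterns route large binary artifacts (model weights, archives, serialized data) through Git LFS. For a quick sanity check of which files a given pattern set captures, a rough sketch using only the Python standard library; the pattern list here is an illustrative subset, and fnmatch only approximates git's own attribute matching (e.g. for '**'):

from fnmatch import fnmatch

# Illustrative subset of the LFS patterns declared above
lfs_patterns = ["*.bin", "*.pt", "*.zip", "saved_model/**/*"]

def tracked_by_lfs(path):
    # Approximate check; git's pattern rules differ in edge cases
    return any(fnmatch(path, pattern) for pattern in lfs_patterns)

print(tracked_by_lfs("model.bin"))  # True
print(tracked_by_lfs("app.py"))     # False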
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Whisper ASR Youtube subtitles creator
+ emoji: 👁
+ colorFrom: yellow
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.9
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ duplicated_from: Finnish-NLP/Whisper-ASR-youtube-subtitles
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
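
The YAML front matter above is what Spaces reads to configure the app (SDK and version, entry file, license). A minimal sketch of loading it programmatically, assuming PyYAML is available:

import yaml

with open("README.md", encoding="utf-8") as f:
    # The configuration sits between the first pair of '---' fences
    front_matter = f.read().split("---")[1]

config = yaml.safe_load(front_matter)
print(config["sdk"], config["sdk_version"])  # gradio 3.9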
app.py ADDED
@@ -0,0 +1,271 @@
+ import os
+ from pathlib import Path
+
+ import pandas as pd
+ import gradio as gr
+ import torch
+ import whisper
+ from pytube import YouTube
+ from easynmt import EasyNMT
+
+ import psutil
+ num_cores = psutil.cpu_count()
+ os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+
+ translation_model = EasyNMT('m2m_100_418M', max_new_tokens=60, max_length=60)
+ asr_model = whisper.load_model("base")
+
+ # UI language name -> m2m_100 language code
+ translation_models = {
+     "Finnish": "fi",
+     "Swedish": "sv",
+     "Danish": "da",
+     "English": "en",
+     "German": "de"
+ }
+
+ # Use CUDA if available, otherwise fall back to CPU
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print("DEVICE IS:")
+ print(device)
+
+ videos_out_path = Path("./videos_out")
+ videos_out_path.mkdir(parents=True, exist_ok=True)
+
+
+ def get_youtube(video_url):
+     """Download the highest-resolution progressive mp4 stream of a Youtube video."""
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+     print("Downloaded to path:")
+     print(abs_video_path)
+     return abs_video_path
+
+
+ async def speech_to_text(video_file_path, selected_translation_lang):
+     """
+     Youtube videos with translated subtitles using OpenAI Whisper and EasyNMT (m2m_100_418M).
+     Currently supports only English audio.
+
+     This space allows you to:
+     1. Download a Youtube video from a given URL
+     2. Watch it in the first video component
+     3. Run automatic speech recognition on the video using Whisper
+     4. Translate the recognized transcriptions to Finnish, Swedish, Danish, English or German (more languages coming later)
+     5. Burn the translations into the original video and watch it in the second video component
+
+     Speech recognition is based on OpenAI Whisper: https://github.com/openai/whisper
+     """
+     if video_file_path is None:
+         raise ValueError("Error: no video input")
+     print(video_file_path)
+
+     try:
+         audio = whisper.load_audio(video_file_path)
+     except Exception as e:
+         raise RuntimeError("Error converting video to audio") from e
+
+     try:
+         print('Transcribing via local model')
+         transcribe_options = dict(beam_size=5, best_of=5, without_timestamps=False)
+         transcription = asr_model.transcribe(audio, **transcribe_options)
+
+         # One row per recognized segment
+         rows = [{'start': segment['start'], 'end': segment['end'], 'text': segment['text']}
+                 for segment in transcription['segments']]
+         df = pd.DataFrame(rows, columns=['start', 'end', 'text'])
+
+         if selected_translation_lang is None:
+             selected_translation_lang = 'Finnish'
+
+         sentences = df['text']
+         df['translation'] = translation_model.translate(sentences, target_lang=translation_models.get(selected_translation_lang))
+
+         print('After translation to target language\n')
+         return df
+     except Exception as e:
+         raise RuntimeError("Error running inference with local model") from e
+
+
+ def format_timestamp(seconds):
+     """Convert seconds to an SRT timestamp, e.g. 125.35 -> '00:02:05,350'."""
+     milliseconds = round(seconds * 1000.0)
+     hours, milliseconds = divmod(milliseconds, 3_600_000)
+     minutes, milliseconds = divmod(milliseconds, 60_000)
+     secs, milliseconds = divmod(milliseconds, 1_000)
+     return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
+
+
+ def create_srt_and_burn(df, video_in):
+     print("Starting creation of video with srt")
+
+     with open('testi.srt', 'w', encoding="utf-8") as file:
+         for i in range(len(df)):
+             # SRT block: index, 'HH:MM:SS,mmm --> HH:MM:SS,mmm', subtitle text
+             file.write(str(i + 1))
+             file.write('\n')
+             file.write(format_timestamp(df.iloc[i]['start']))
+             file.write(' --> ')
+             file.write(format_timestamp(df.iloc[i]['end']))
+             file.write('\n')
+             file.write(df.iloc[i]['translation'])
+             if i != len(df) - 1:
+                 file.write('\n\n')
+
+     print("SRT DONE")
+     try:
+         with open('./testi.srt', 'r', encoding="utf-8") as srt_file:
+             for line in srt_file:
+                 print(line)
+
+         print(type(video_in))
+         print(video_in)
+
+         video_out = video_in.replace('.mp4', '_out.mp4')
+         print(video_out)
+         command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out)
+         print(command)
+         os.system(command)
+         return video_out
+     except Exception as e:
+         raise RuntimeError("Error burning subtitles into video") from e
+
+
+ # ---- Gradio Layout -----
+ video_in = gr.Video(label="Video file", mirror_webcam=False)
+ youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+ video_out = gr.Video(label="Video Out", mirror_webcam=False)
+
+ df_init = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])
+ selected_translation_lang = gr.Dropdown(choices=["English", "German", "Finnish", "Swedish", "Danish"], type="value", value="English", label="Language to translate transcriptions to", interactive=True)
+
+ transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10)
+
+ demo = gr.Blocks(css='''
+ #cut_btn, #reset_btn { align-self:stretch; }
+ #\\31 3 { max-width: 540px; }
+ .output-markdown {max-width: 65ch !important;}
+ ''')
+ demo.encrypt = False
+
+ with demo:
+     transcription_var = gr.Variable()
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown('''
+             ### This space allows you to:
+             ##### 1. Download a Youtube video from a given URL
+             ##### 2. Watch it in the first video component
+             ##### 3. Run automatic speech recognition on the video using Whisper (please remember to select the translation language)
+             ##### 4. Translate the recognized transcriptions to English, Finnish, Swedish, Danish or German
+             ##### 5. Burn the translations into the original video and watch it in the second video component
+             ''')
+
+         with gr.Column():
+             gr.Markdown('''
+             ### 1. Insert a Youtube URL below (some examples I suggest using for first tests)
+             ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
+             ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
+             ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
+             ''')
+
+     with gr.Row():
+         with gr.Column():
+             youtube_url_in.render()
+             download_youtube_btn = gr.Button("Step 1. Download Youtube video")
+             download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+
+     with gr.Row():
+         with gr.Column():
+             video_in.render()
+         with gr.Column():
+             gr.Markdown('''
+             ##### Here you can start the transcription and translation process.
+             ##### Be aware that processing will take a while (a 35-second video took around 20 seconds in my testing).
+             ''')
+             transcribe_btn = gr.Button("Step 2. Transcribe and translate audio")
+             transcribe_btn.click(speech_to_text, [video_in, selected_translation_lang], transcription_df)
+
+     with gr.Row():
+         with gr.Column():
+             selected_translation_lang.render()
+
+     with gr.Row():
+         gr.Markdown('''
+         ##### Here you will get the transcription and translation output.
+         ##### If you see an error, please remember to select a translation language.
+         ''')
+
+     with gr.Row():
+         with gr.Column():
+             transcription_df.render()
+
+     with gr.Row():
+         with gr.Column():
+             translate_and_make_srt_btn = gr.Button("Step 3. Create and burn srt to video")
+             translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_df, video_in], [video_out])
+             video_out.render()
+
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)
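
The trickiest part of create_srt_and_burn is the timestamp arithmetic. A self-contained sketch of the same conversion used by format_timestamp above, plus an inverse for a round-trip sanity check (from_srt_timestamp is illustrative and not part of the app):

def to_srt_timestamp(seconds):
    # Split total milliseconds into hours, minutes, seconds, milliseconds
    milliseconds = round(seconds * 1000.0)
    hours, milliseconds = divmod(milliseconds, 3_600_000)
    minutes, milliseconds = divmod(milliseconds, 60_000)
    secs, milliseconds = divmod(milliseconds, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"

def from_srt_timestamp(stamp):
    hms, millis = stamp.split(",")
    hours, minutes, secs = map(int, hms.split(":"))
    return hours * 3600 + minutes * 60 + secs + int(millis) / 1000.0

assert to_srt_timestamp(125.35) == "00:02:05,350"
assert abs(from_srt_timestamp("00:02:05,350") - 125.35) < 1e-9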
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
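
packages.txt installs the ffmpeg binary that app.py invokes through os.system. A hedged alternative sketch using subprocess.run with an argument list, which avoids shell-quoting problems in downloaded file names (the subtitles filter string itself may still need escaping for unusual paths):

import subprocess

def burn_subtitles(video_in, srt_path, video_out):
    # Same ffmpeg invocation as in app.py, without going through a shell
    subprocess.run(
        ["ffmpeg", "-i", video_in, "-y", "-vf", f"subtitles={srt_path}", video_out],
        check=True,  # raise CalledProcessError if ffmpeg exits non-zero
    )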
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ fastapi==0.85.0
+ ffmpeg-python==0.2.0
+ gradio==3.9
+ matplotlib==3.6.1
+ pandas==1.5.0
+ pytube==12.1.0
+ sacremoses==0.0.53
+ sentencepiece==0.1.97
+ tokenizers==0.12.1
+ torch==1.12.1
+ torchaudio==0.12.1
+ tqdm==4.64.1
+ EasyNMT==2.0.2
+ transformers==4.22.2
+ whisper @ git+https://github.com/openai/whisper.git
+ psutil==5.9.2
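
To confirm these pins actually resolved in the Space's runtime, a small sketch using only the standard library (the names listed are distribution names taken from this file):

from importlib.metadata import version

for pkg in ["gradio", "pandas", "torch", "transformers", "pytube"]:
    print(pkg, version(pkg))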