RasmusToivanen committed
Commit c2ae77e • 1 Parent(s): b8abbf2
README.md CHANGED
@@ -1,13 +1,45 @@
  ---
  title: Fin Eng ASR Autosubtitles
- emoji: 😻
- colorFrom: green
+ emoji: 🌍
+ colorFrom: indigo
  colorTo: yellow
  sdk: gradio
- sdk_version: 3.0.26
+ sdk_version: 3.0.24
  app_file: app.py
  pinned: false
- license: apache-2.0
+ license: mit
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ We use Opus-MT models in the code. Here are the citations:
+ ```
+ @inproceedings{tiedemann-thottingal-2020-opus,
+ title = "{OPUS}-{MT} {--} Building open translation services for the World",
+ author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
+ booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
+ month = nov,
+ year = "2020",
+ address = "Lisboa, Portugal",
+ publisher = "European Association for Machine Translation",
+ url = "https://aclanthology.org/2020.eamt-1.61",
+ pages = "479--480",
+ }
+ @inproceedings{tiedemann-2020-tatoeba,
+ title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
+ author = {Tiedemann, J{\"o}rg},
+ booktitle = "Proceedings of the Fifth Conference on Machine Translation",
+ month = nov,
+ year = "2020",
+ address = "Online",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2020.wmt-1.139",
+ pages = "1174--1182",
+ }
+
+ Wav2vec2:
+ BAEVSKI, Alexei, et al. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in Neural Information Processing Systems, 2020, 33: 12449-12460.
+
+ T5:
+ RAFFEL, Colin, et al. Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res., 2020, 21.140: 1-67.
+ ```
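
As a rough sketch of how the cited models fit together (the model names are the ones used in app.py; the input sentence is a made-up example, and app.py actually runs the T5 model through fastT5/ONNX rather than plain `transformers`):

```python
# Minimal sketch: lower-cased ASR text -> T5 casing/punctuation correction -> Opus-MT fi->en translation.
# Model identifiers come from app.py in this commit; the input string is illustrative only.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer

# T5 model that restores casing and punctuation in raw ASR output
t5_name = "Finnish-NLP/t5-small-nl24-casing-punctuation-correction"
t5_tokenizer = AutoTokenizer.from_pretrained(t5_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_name)

# Opus-MT model for Finnish -> English translation
mt_name = "Helsinki-NLP/opus-mt-tc-big-fi-en"
mt_tokenizer = MarianTokenizer.from_pretrained(mt_name)
mt_model = MarianMTModel.from_pretrained(mt_name)

asr_text = "terve maailma tämä on esimerkki"  # placeholder for lower-cased ASR output

# Restore casing and punctuation
t5_inputs = t5_tokenizer(asr_text, return_tensors="pt").input_ids
corrected = t5_tokenizer.decode(t5_model.generate(t5_inputs, max_length=128)[0], skip_special_tokens=True)

# Translate the corrected sentence to English
mt_inputs = mt_tokenizer([corrected], return_tensors="pt", padding=True)
translation = mt_tokenizer.decode(mt_model.generate(**mt_inputs)[0], skip_special_tokens=True)
print(corrected, "->", translation)
```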
app.py ADDED
@@ -0,0 +1,321 @@
+ import gradio as gr
+ import json
+ from difflib import Differ
+ import ffmpeg
+ import os
+ from pathlib import Path
+ import time
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from transformers import MarianMTModel, MarianTokenizer
+ import pandas as pd
+ import re
+ import time
+ import os
+ from fuzzywuzzy import fuzz
+ from fastT5 import export_and_get_onnx_model
+ import torch
+ from transformers import pipeline
+
+ MODEL = "Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish"
+ marian_nmt_model = "Helsinki-NLP/opus-mt-tc-big-fi-en"
+ tokenizer_marian = MarianTokenizer.from_pretrained(marian_nmt_model)
+ model = MarianMTModel.from_pretrained(marian_nmt_model)
+
+ cuda = torch.device(
+     'cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+ sr_pipeline_device = 0 if torch.cuda.is_available() else -1
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ speech_recognizer = pipeline(
+     task="automatic-speech-recognition",
+     model=f'{MODEL}',
+     tokenizer=f'{MODEL}',
+     framework="pt",
+     device=sr_pipeline_device,
+ )
+
+ model_checkpoint = 'Finnish-NLP/t5-small-nl24-casing-punctuation-correction'
+ tokenizer_t5 = AutoTokenizer.from_pretrained(model_checkpoint)
+ model_t5 = export_and_get_onnx_model(model_checkpoint)
+ #model_t5 = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_flax=False, torch_dtype=torch.float32).to(device)
+
+
+
+
+ videos_out_path = Path("./videos_out")
+ videos_out_path.mkdir(parents=True, exist_ok=True)
+
+ samples_data = sorted(Path('examples').glob('*.json'))
+ SAMPLES = []
+ for file in samples_data:
+     with open(file) as f:
+         sample = json.load(f)
+     SAMPLES.append(sample)
+ VIDEOS = list(map(lambda x: [x['video']], SAMPLES))
+
+ total_inferences_since_reboot = 0
+ total_cuts_since_reboot = 0
+
+
+
+
+ async def speech_to_text(video_file_path):
+     """
+     Takes a video path, converts it to 16 kHz mono audio, and transcribes the audio track to text with word-level timestamps.
+
+     Uses the https://huggingface.co/tasks/automatic-speech-recognition pipeline.
+     """
+     global total_inferences_since_reboot
+     if(video_file_path == None):
+         raise ValueError("Error no video input")
+
+     video_path = Path(video_file_path)
+
+     try:
+         # convert video to audio 16k using PIPE to audio_memory
+         audio_memory, _ = ffmpeg.input(video_path).output(
+             '-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
+     except Exception as e:
+         raise RuntimeError("Error converting video to audio")
+
+     last_time = time.time()
+
+     try:
+
+         output = speech_recognizer(
+             audio_memory, return_timestamps="word", chunk_length_s=10, stride_length_s=(4, 2))
+
+         transcription = output["text"].lower()
+
+         timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
+                       for chunk in output['chunks']]
+         input_ids = tokenizer_t5(transcription, return_tensors="pt").input_ids.to(device)
+         outputs = model_t5.generate(input_ids, max_length=128)
+         case_corrected_text = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
+         translated = model.generate(**tokenizer_marian([case_corrected_text], return_tensors="pt", padding=True))
+         translated_plain = "".join([tokenizer_marian.decode(t, skip_special_tokens=True) for t in translated])
+
+         for timestamp in timestamps:
+             total_inferences_since_reboot += 1
+
+
+         df = pd.DataFrame(timestamps, columns = ['word', 'start','stop'])
+
+         df['start'] = df['start'].astype('float16')
+         df['stop'] = df['stop'].astype('float16')
+
+
+         print("\n\ntotal_inferences_since_reboot: ",
+               total_inferences_since_reboot, "\n\n")
+         return (transcription, transcription, timestamps,df, case_corrected_text, translated_plain)
+     except Exception as e:
+         raise RuntimeError("Error Running inference with local model", e)
+
+
+ def create_srt(text_out_t5, df):
+
+     df.columns = ['word', 'start', 'stop']
+
+     df_sentences = pd.DataFrame(columns=['sentence','start','stop','translated'])
+     found_match_value = 0
+     found_match_word = ""
+
+     t5_sentences = re.split('[.]|[?]|[!]', text_out_t5)
+     t5_sentences = [sentence.replace('.','').replace('?','').replace('!','') for sentence in t5_sentences if sentence]
+
+     for i, sentence in enumerate(t5_sentences):
+         sentence = sentence.lower().split(" ")
+         if i == 0:
+             df_subset = df[df['stop'] <10]
+             start = df.iloc[0]['start']
+
+             for j, word in enumerate(df_subset['word']):
+                 temp_value = fuzz.partial_ratio((word), sentence[-1])
+                 if temp_value > found_match_value:
+                     found_match_value = temp_value
+                     found_match_word = word
+
+             stop = df_subset[df_subset['word'] == found_match_word]
+
+             translated = model.generate(**tokenizer_marian(t5_sentences[i], return_tensors="pt", padding=True))
+             translated_plain = [tokenizer_marian.decode(t, skip_special_tokens=True) for t in translated]
+
+             dict_to_add = {
+                 'sentence': t5_sentences[i],
+                 'start': start,
+                 'stop': stop.iloc[0]['stop'],
+                 'translated': translated_plain[0]
+             }
+
+             df_sentences = df_sentences.append(dict_to_add, ignore_index=True)
+             new_start = df.iloc[stop.index.values[0]+1]['start']
+             new_stop = new_start + 10
+         else:
+             found_match_value = 0
+             found_match_word = ""
+
+             df_subset = df[(df['start'] >= new_start) & (df['stop'] <= new_stop)]
+             start = df_subset.iloc[0]['start']
+
+             for j, word in enumerate(df_subset['word']):
+                 temp_value = fuzz.partial_ratio((word), sentence[-1])
+                 if temp_value > found_match_value:
+                     found_match_value = temp_value
+                     found_match_word = word
+             stop = df_subset[df_subset['word'] == found_match_word]
+
+
+             translated = model.generate(**tokenizer_marian(t5_sentences[i], return_tensors="pt", padding=True))
+             translated_plain = [tokenizer_marian.decode(t, skip_special_tokens=True) for t in translated]
+
+
+             dict_to_add = {
+                 'sentence': t5_sentences[i],
+                 'start': start,
+                 'stop': stop.iloc[0]['stop'],
+                 'translated': translated_plain[0]
+             }
+             df_sentences = df_sentences.append(dict_to_add, ignore_index=True)
+             try:
+                 new_start = df.iloc[stop.index.values[0]+1]['start']
+                 new_stop = new_start + 10
+             except Exception as e:
+                 df_sentences = df_sentences.iloc[0:i+1]
+
+     return df_sentences
+
+ def create_srt_and_burn(video_in, srt_sentences):
+     srt_sentences.columns = ['sentence', 'start', 'stop','translated']
+     srt_sentences.dropna(inplace=True)
+     srt_sentences['start'] = srt_sentences['start'].astype('float')
+     srt_sentences['stop'] = srt_sentences['stop'].astype('float')
+
+
+     with open('testi.srt','w') as file:
+         for i in range(len(srt_sentences)):
+             file.write(str(i+1))
+             file.write('\n')
+             start = (time.strftime('%H:%M:%S', time.gmtime(srt_sentences.iloc[i]['start'])))
+             if "." in str(srt_sentences.iloc[i]['start']):
+                 if len(str(srt_sentences.iloc[i]['start']).split('.')[1]) > 3:
+                     start = start + '.' + str(srt_sentences.iloc[i]['start']).split('.')[1][:3]
+                 else:
+                     start = start + '.' + str(srt_sentences.iloc[i]['start']).split('.')[1]
+             file.write(start)
+             stop = (time.strftime('%H:%M:%S', time.gmtime(srt_sentences.iloc[i]['stop'])))
+             if len(str(srt_sentences.iloc[i]['stop']).split('.')[1]) > 3:
+                 stop = stop + '.' + str(srt_sentences.iloc[i]['stop']).split('.')[1][:3]
+             else:
+                 stop = stop + '.' + str(srt_sentences.iloc[i]['stop']).split('.')[1]
+             file.write(' --> ')
+             file.write(stop)
+             file.write('\n')
+             file.writelines(srt_sentences.iloc[i]['translated'])
+             if int(i) != len(srt_sentences)-1:
+                 file.write('\n\n')
+     try:
+         file1 = open('./testi.srt', 'r')
+         Lines = file1.readlines()
+
+         count = 0
+         # Strips the newline character
+         for line in Lines:
+             count += 1
+
+
+
+         video_out = str(Path(video_in)).replace('.mp4', '_out.mp4')
+         command = "ffmpeg -i {} -y -vf subtitles=./testi.srt {}".format(Path(video_in), Path(video_out))
+         os.system(command)
+         return video_out
+     except Exception as e:
+         print(e)
+         return video_out
+
+
+ # ---- Gradio Layout -----
+ video_in = gr.Video(label="Video file", interactive=True)
+ text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
+ text_out_t5 = gr.Textbox(label="Transcription T5", lines=10, interactive=True)
+ translation_out = gr.Textbox(label="Translation", lines=10, interactive=True)
+ text_out_timestamps = gr.Textbox(label="Word level timestamps", lines=10, interactive=True)
+ srt_sentences = gr.DataFrame(label="Srt lines", row_count=(0, "dynamic"))
+ video_out = gr.Video(label="Video Out")
+ diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
+ examples = gr.components.Dataset(
+     components=[video_in], samples=VIDEOS, type="index")
+
+ demo = gr.Blocks(enable_queue=True, css='''
+ #cut_btn, #reset_btn { align-self:stretch; }
+ #\\31 3 { max-width: 540px; }
+ .output-markdown {max-width: 65ch !important;}
+ ''')
+ demo.encrypt = False
+ with demo:
+     transcription_var = gr.Variable()
+     timestamps_var = gr.Variable()
+     timestamps_df = gr.Dataframe(visible=False, row_count=(0, "dynamic"))
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown('''
+ # Create videos with English subtitles from videos spoken in Finnish
+ This project is a quick proof of concept of a simple video editor where you can add English subtitles to Finnish videos.
+ This space currently only works for short videos (Up to 128 tokens) but will be improved in next versions.
+ Space uses our finetuned Finnish ASR models, Our pretrained + finetuned Finnish T5 model for casing+punctuation correction and Opus-MT models from Helsinki University for Finnish --> English translation.
+ This space was inspired by https://huggingface.co/spaces/radames/edit-video-by-editing-text
+ ''')
+
+     with gr.Row():
+
+         examples.render()
+
+         def load_example(id):
+             video = SAMPLES[id]['video']
+             transcription = ''
+             timestamps = SAMPLES[id]['timestamps']
+
+             return (video, transcription, transcription, timestamps)
+
+         examples.click(
+             load_example,
+             inputs=[examples],
+             outputs=[video_in, text_in, transcription_var, timestamps_var],
+             queue=False)
+     with gr.Row():
+         with gr.Column():
+             video_in.render()
+             transcribe_btn = gr.Button("1. Press here to transcribe Audio")
+             transcribe_btn.click(speech_to_text, [video_in], [
+                 text_in, transcription_var, text_out_timestamps,timestamps_df, text_out_t5, translation_out])
+
+     with gr.Row():
+         gr.Markdown('''
+ ### Here you will get varying outputs from different parts of the processing
+ ASR model output, T5 model output which corrects casing + hyphenation, sentence level translations and word level timestamps''')
+
+     with gr.Row():
+         with gr.Column():
+             text_in.render()
+         with gr.Column():
+             text_out_t5.render()
+         with gr.Column():
+             translation_out.render()
+         with gr.Column():
+             text_out_timestamps.render()
+     with gr.Row():
+         with gr.Column():
+             translate_and_make_srt_btn = gr.Button("2. Press here to create rows for subtitles")
+             translate_and_make_srt_btn.click(create_srt, [text_out_t5, timestamps_df], [
+                 srt_sentences])
+     with gr.Row():
+         with gr.Column():
+             srt_sentences.render()
+     with gr.Row():
+         with gr.Column():
+             translate_and_make_srt_btn = gr.Button("3. Press here to create subtitle file and insert translations to video")
+             translate_and_make_srt_btn.click(create_srt_and_burn, [video_in, srt_sentences], [
+                 video_out])
+             video_out.render()
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)
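
For a rough idea of how the three functions defined in app.py chain together outside the Gradio UI, here is a hypothetical driver sketch (not part of the commit; `my_clip.mp4` is a placeholder path, and importing `app` loads all three models, so it is slow):

```python
# Hypothetical driver for the functions in app.py; assumes it runs from the repo root.
import asyncio
import app  # the app.py added in this commit

# speech_to_text is async and returns a 6-tuple:
# (transcription, transcription, word timestamps, timestamp DataFrame, corrected text, translation)
outputs = asyncio.run(app.speech_to_text("my_clip.mp4"))
transcription, _, timestamps, timestamps_df, corrected, translation = outputs

# Build sentence-level subtitle rows, then burn them into the video with ffmpeg
srt_rows = app.create_srt(corrected, timestamps_df)
subtitled_path = app.create_srt_and_burn("my_clip.mp4", srt_rows)
print(subtitled_path)
```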
examples/.gitattributes ADDED
@@ -0,0 +1,3 @@
+ eka.mp4 filter=lfs diff=lfs merge=lfs -text
+ toka.mp4 filter=lfs diff=lfs merge=lfs -text
+ kolmas.mp4 filter=lfs diff=lfs merge=lfs -text
examples/video_1.json ADDED
@@ -0,0 +1 @@
+ {"video":"./examples/video_1.mp4", "transcription": "", "timestamps": []}
examples/video_1.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2274caa70e7be8994aa0b2e6c29eface3817f53d5e37d3f3984f95e5460dd4f
+ size 31346388
examples/video_2.json ADDED
@@ -0,0 +1 @@
+ {"video":"./examples/video_2.mp4", "transcription": "", "timestamps": []}
examples/video_2.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e0ffb151623c1978af61e1a476fae4385deba658427b005ceb907bd95106eb2
+ size 32746315
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ torch
+ transformers
+ gradio==3.0.24
+ datasets
+ librosa
+ ffmpeg-python
+ python-dotenv
+ pandas
+ fuzzywuzzy
+ python-Levenshtein
+ sentencepiece
+ protobuf
+ pyctcdecode
+ https://github.com/kpu/kenlm/archive/master.zip
+ sacremoses
+ fastt5