Raphael committed on
Commit 720b03b
1 Parent(s): 1538088

Signed-off-by: Raphael <oOraph@users.noreply.github.com>

Files changed (4)
  1. .gitignore +1 -0
  2. app.py +252 -0
  3. packages.txt +1 -0
  4. requirements.txt +15 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
app.py ADDED
@@ -0,0 +1,252 @@
+ import logging
+ import math
+ import os
+ import shutil
+ import time
+
+ from datasets import load_dataset
+ import gradio as gr
+ import moviepy.editor as mp
+ import numpy as np
+ import pysrt
+ import torch
+ from transformers import pipeline
+ import yt_dlp
+
+
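+ # hf_transfer (listed in requirements.txt) speeds up model downloads from the Hub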
+ os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', force=True)
+
+ LOG = logging.getLogger(__name__)
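+ # Each 20 s audio clip is split into 4 subtitle slices of 5 s each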
+ CLIP_SECONDS = 20
+ SLICES = 4
+ SLICE_DURATION = CLIP_SECONDS / SLICES
+ # At most 15 min of audio (MAX_CHUNKS * CLIP_SECONDS = 900 s)
+ MAX_CHUNKS = 45
+ BASEDIR = '/tmp/processed'
+
+ os.makedirs(BASEDIR, exist_ok=True)
30
+
31
+ asr_kwargs = {
32
+ "task": "automatic-speech-recognition",
33
+ "model": "openai/whisper-medium.en"
34
+ }
35
+
36
+ translator_kwargs = {
37
+ "task": "translation_en_to_fr",
38
+ "model": "Helsinki-NLP/opus-mt-en-fr"
39
+ }
40
+
41
+ summarizer_kwargs = {
42
+ "task": "summarization",
43
+ "model": "facebook/bart-large-cnn"
44
+ }
45
+
46
+ if torch.cuda.is_available():
47
+ LOG.info("GPU available")
48
+
49
+ asr_kwargs['device'] = 'cuda:0'
50
+ translator_kwargs['device'] = 'cuda:0'
51
+ summarizer_kwargs['device'] = 'cuda:0'
52
+
53
+ # All three models should fit together on a single T4 GPU
54
+
55
+ LOG.info("Fetching ASR model from the Hub if not already there")
56
+ asr = pipeline(**asr_kwargs)
57
+
58
+ LOG.info("Fetching translation model from the Hub if not already there")
59
+ translator = pipeline(**translator_kwargs)
60
+
61
+ LOG.info("Fetching summarization model from the Hub if not already there")
62
+ summarizer = pipeline(**summarizer_kwargs)
63
+
64
+
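+ # End-to-end pipeline: download the video, split its audio into 20 s clips,
+ # transcribe, optionally translate, then build the SRT file and a summary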
+ def demo(url: str, translate: bool):
+     basedir = BASEDIR
+     video_path, video = download(url, os.path.join(basedir, 'video.mp4'))
+     audio_clips(video, basedir)
+     srt_file, summary = process_video(basedir, video.duration, translate)
+     return summary, srt_file, [video_path, srt_file]
+
+
+ def download(url, dst):
+     LOG.info("Downloading provided url %s", url)
+
+     opts = {
+         'skip_download': False,
+         'overwrites': True,
+         'format': 'mp4',
+         'outtmpl': {'default': dst}
+     }
+
+     with yt_dlp.YoutubeDL(opts) as dl:
+         dl.download([url])
+
+     return dst, mp.VideoFileClip(dst)
+
+
+ def audiodir(basedir):
+     return os.path.join(basedir, 'audio')
+
+
+ def audio_clips(video: mp.VideoFileClip, basedir: str):
+     LOG.info("Building audio clips")
+
+     clips_dir = audiodir(basedir)
+     shutil.rmtree(clips_dir, ignore_errors=True)
+     os.makedirs(clips_dir, exist_ok=True)
+
+     audio = video.audio
+     end = audio.duration
+
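+     # Zero-pad clip indices so file names sort in chronological order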
+     digits = int(math.log(end / CLIP_SECONDS, 10)) + 1
+
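+     # Write each 20 s slice as a 16 kHz ogg clip (the sample rate Whisper expects)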
+     for idx, i in enumerate(range(0, int(end), CLIP_SECONDS)):
+         sub_end = min(i + CLIP_SECONDS, end)
+         sub_clip = audio.subclip(t_start=i, t_end=sub_end)
+         audio_file = os.path.join(clips_dir, f"audio_{idx:0{digits}d}.ogg")
+         sub_clip.write_audiofile(audio_file, fps=16000)
+
+
+ def process_video(basedir: str, duration, translate: bool):
+     audio_dir = audiodir(basedir)
+     transcriptions = transcription(audio_dir, duration)
+     subs = translation(transcriptions, translate)
+     srt_file = build_srt_clips(subs, basedir)
+     summary = summarize(transcriptions, translate)
+     return srt_file, summary
+
+
+ def transcription(audio_dir: str, duration):
+     LOG.info("Audio transcription")
+     # Chunk count is approximate, which is fine here
+     chunks = int(duration / CLIP_SECONDS + 1)
+     chunks = min(chunks, MAX_CHUNKS)
+
+     LOG.debug("Loading audio clips dataset")
+
+     dataset = load_dataset("audiofolder", data_dir=audio_dir)
+     dataset = dataset['train']
+     dataset = dataset['audio'][0:chunks]
+
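+     # Run ASR over the clips in 5 roughly equal batches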
+     start = time.time()
+     transcriptions = []
+     for i, d in enumerate(np.array_split(dataset, 5)):
+         d = list(d)
+         LOG.info("ASR batch %d / 5, samples %d", i, len(d))
+         t = asr(d, max_new_tokens=10000)
+         transcriptions.extend(t)
+
+     transcriptions = [t['text'] for t in transcriptions]
+     elapsed = time.time() - start
+     LOG.info("Transcription done, elapsed %.2f seconds", elapsed)
+     return transcriptions
+
+
+ def translation(transcriptions, translate):
+     if translate:
+         LOG.info("Performing translation")
+         start = time.time()
+         translations = translator(transcriptions)
+         translations = [t['translation_text'] for t in translations]
+         elapsed = time.time() - start
+         LOG.info("Translation done, elapsed %.2f seconds", elapsed)
+     else:
+         translations = transcriptions
+     return translations
+
+
+ def summarize(transcriptions, translate):
+     LOG.info("Generating video summary")
+     whole_text = ' '.join(transcriptions).strip()
+     word_count = len(whole_text.split())
+     summary = summarizer(whole_text)
+     # min_length=word_count // 4 + 1,
+     # max_length=word_count // 2 + 1)
+     summary = translation([summary[0]['summary_text']], translate)[0]
+     return summary
+
+
+ def subs_to_timed_segments(subtitles: list[str]):
+     LOG.info("Building srt segments")
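+     # Each transcription covers one CLIP_SECONDS clip: split its words into
+     # SLICES equal pieces and give each a fixed SLICE_DURATION window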
+     all_chunks = []
+     for sub in subtitles:
+         chunks = np.array_split(sub.split(' '), SLICES)
+         all_chunks.extend(chunks)
+
+     subs = []
+     for c in all_chunks:
+         c = ' '.join(c)
+         subs.append(c)
+
+     segments = []
+     for i, c in enumerate(subs):
+         segments.append({
+             'text': c.strip(),
+             'start': i * SLICE_DURATION,
+             'end': (i + 1) * SLICE_DURATION
+         })
+
+     return segments
+
+
+ def build_srt_clips(subs, basedir):
+     LOG.info("Generating subtitles")
+     segments = subs_to_timed_segments(subs)
+
+     LOG.info("Building srt clips")
+     max_text_len = 30
+     subtitles = pysrt.SubRipFile()
+     first = True
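+     # Timings are passed to SubRipTime(hours, minutes, seconds, milliseconds)
+     # as raw milliseconds; texts over max_text_len chars are split in two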
+     for segment in segments:
+         start = segment['start'] * 1000
+         if first:
+             start += 3000
+             first = False
+         end = segment['end'] * 1000
+         text = segment['text']
+         text = text.strip()
+         if len(text) < max_text_len:
+             o = pysrt.SubRipItem()
+             o.start = pysrt.SubRipTime(0, 0, 0, start)
+             o.end = pysrt.SubRipTime(0, 0, 0, end)
+             o.text = text
+             subtitles.append(o)
+         else:
+             # Just split in two, should be ok in most cases
+             words = text.split()
+             o = pysrt.SubRipItem()
+             o.text = ' '.join(words[0:len(words) // 2])
+             o.start = pysrt.SubRipTime(0, 0, 0, start)
+             chkpt = (start + end) / 2
+             o.end = pysrt.SubRipTime(0, 0, 0, chkpt)
+             subtitles.append(o)
+             o = pysrt.SubRipItem()
+             o.text = ' '.join(words[len(words) // 2:])
+             o.start = pysrt.SubRipTime(0, 0, 0, chkpt)
+             o.end = pysrt.SubRipTime(0, 0, 0, end)
+             subtitles.append(o)
+
+     srt_path = os.path.join(basedir, 'video.srt')
+     subtitles.save(srt_path, encoding='utf-8')
+     LOG.info("Subtitles saved in srt file %s", srt_path)
+     return srt_path
+
+
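+ # Gradio UI: video URL and translate toggle in; summary text, the SRT file,
+ # and the (video, subtitles) pair out, as returned by demo()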
+ iface = gr.Interface(
+     fn=demo,
+     inputs=[
+         gr.Text(value="https://youtu.be/tiZFewofSLM", label="English video url"),
+         gr.Checkbox(value=True, label='Translate to French')],
+     outputs=[
+         gr.Text(label="Video summary"),
+         gr.File(label="SRT file"),
+         gr.Video(label="Video with subtitles"),
+     ])
+
+ iface.launch()
packages.txt ADDED
@@ -0,0 +1 @@
+ imagemagick
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ jupyter
+ notebook
+ numpy
+ torch
+ transformers
+ hf_transfer
+ moviepy
+ yt-dlp
+ datasets
+ soundfile
+ librosa
+ sentencepiece
+ pysrt
+ gradio
+ sacremoses