radames committed on
Commit e1e093e
1 Parent(s): 6936ddf
Files changed (4)
  1. .gitignore +2 -1
  2. app.py +131 -234
  3. pre-requirements.txt +1 -0
  4. requirements.txt +8 -2
.gitignore CHANGED
@@ -1,3 +1,4 @@
+gradio_cached_examples/
 videos_out/
 results/
 # Python build
@@ -45,4 +46,4 @@ workspace.code-workspace
 # log files
 .pnpm-debug.log
 venv/
-*.db-journal
+*.db-journal
app.py CHANGED
@@ -1,45 +1,46 @@
 import gradio as gr
 import json
-from difflib import Differ
 import ffmpeg
 import os
 from pathlib import Path
 import time
-import aiohttp
-import asyncio
-
-
-# Set true if you're using huggingface inference API API https://huggingface.co/inference-api
-API_BACKEND = True
-# MODEL = 'facebook/wav2vec2-large-960h-lv60-self'
-# MODEL = "facebook/wav2vec2-large-960h"
-MODEL = "facebook/wav2vec2-base-960h"
-# MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
-if API_BACKEND:
-    from dotenv import load_dotenv
-    import base64
-    import asyncio
-    load_dotenv(Path(".env"))
-
-    HF_TOKEN = os.environ["HF_TOKEN"]
-    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-    API_URL = f'https://api-inference.huggingface.co/models/{MODEL}'
-
-else:
-    import torch
-    from transformers import pipeline
-
-    # is cuda available?
-    cuda = torch.device(
-        'cuda:0') if torch.cuda.is_available() else torch.device('cpu')
-    device = 0 if torch.cuda.is_available() else -1
-    speech_recognizer = pipeline(
-        task="automatic-speech-recognition",
-        model=f'{MODEL}',
-        tokenizer=f'{MODEL}',
-        framework="pt",
-        device=device,
+from transformers import pipeline
+import torch
+
+# checkpoint = "openai/whisper-tiny"
+# checkpoint = "openai/whisper-base"
+checkpoint = "openai/whisper-small"
+
+if torch.cuda.is_available() and torch.cuda.device_count() > 0:
+    from transformers import (
+        AutomaticSpeechRecognitionPipeline,
+        WhisperForConditionalGeneration,
+        WhisperProcessor,
+    )
+    model = WhisperForConditionalGeneration.from_pretrained(
+        checkpoint).to("cuda").half()
+    processor = WhisperProcessor.from_pretrained(checkpoint)
+    pipe = AutomaticSpeechRecognitionPipeline(
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        batch_size=8,
+        torch_dtype=torch.float16,
+        device="cuda:0"
     )
+else:
+    pipe = pipeline(model=checkpoint)
+
+
+# TODO: no longer need to set these manually once the models have been updated on the Hub
+# whisper-tiny
+# pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
+# whisper-base
+# pipe.model.generation_config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
+# whisper-small
+pipe.model.generation_config.alignment_heads = [[5, 3], [5, 9], [
+    8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
+
 
 videos_out_path = Path("./videos_out")
 videos_out_path.mkdir(parents=True, exist_ok=True)
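The hunk above swaps the Wav2Vec2 / Inference-API setup for a single Whisper `pipe` (fp16 on CUDA when available, a plain `pipeline(...)` on CPU) and pins `alignment_heads` so word-level timestamps work. As a hedged illustration only (not part of the commit), this is roughly how that `pipe` is exercised, mirroring the arguments `speech_to_text` passes later in this diff; `sample.wav` is a placeholder path:

```python
# Sketch only, not part of the commit. "sample.wav" is a placeholder path;
# the call arguments mirror what speech_to_text uses below.
output = pipe(
    "sample.wav",
    chunk_length_s=10,
    stride_length_s=[4, 2],
    return_timestamps="word",   # word timestamps rely on the alignment_heads set above
)
print(output["text"])           # full transcription
for chunk in output["chunks"]:  # [{"text": " word", "timestamp": (start, end)}, ...]
    print(chunk["text"], chunk["timestamp"])
```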
@@ -52,124 +53,58 @@ for file in samples_data:
     SAMPLES.append(sample)
 VIDEOS = list(map(lambda x: [x['video']], SAMPLES))
 
-total_inferences_since_reboot = 415
-total_cuts_since_reboot = 1539
-
 
-async def speech_to_text(video_file_path):
+async def speech_to_text(video_in):
     """
     Takes a video path to convert to audio, transcribe audio channel to text and char timestamps
 
     Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
     """
-    global total_inferences_since_reboot
-    if (video_file_path == None):
-        raise ValueError("Error no video input")
+    video_in = video_in[0] if isinstance(video_in, list) else video_in
+    if (video_in == None):
+        raise ValueError("Video input undefined")
 
-    video_path = Path(video_file_path)
+    video_path = Path(video_in.name)
     try:
         # convert video to audio 16k using PIPE to audio_memory
         audio_memory, _ = ffmpeg.input(video_path).output(
-            '-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
+            '-', format="wav", ac=1, ar=pipe.feature_extractor.sampling_rate).overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
     except Exception as e:
         raise RuntimeError("Error converting video to audio")
 
-    ping("speech_to_text")
-    last_time = time.time()
-    if API_BACKEND:
-        # Using Inference API https://huggingface.co/inference-api
-        # try twice, because the model must be loaded
-        for i in range(10):
-            for tries in range(4):
-                print(f'Transcribing from API attempt {tries}')
-                try:
-                    inference_reponse = await query_api(audio_memory)
-                    print(inference_reponse)
-                    transcription = inference_reponse["text"].lower()
-                    timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
-                                  for chunk in inference_reponse['chunks']]
-
-                    total_inferences_since_reboot += 1
-                    print("\n\ntotal_inferences_since_reboot: ",
-                          total_inferences_since_reboot, "\n\n")
-                    return (transcription, transcription, timestamps)
-                except Exception as e:
-                    print(e)
-                    if 'error' in inference_reponse and 'estimated_time' in inference_reponse:
-                        wait_time = inference_reponse['estimated_time']
-                        print("Waiting for model to load....", wait_time)
-                        # wait for loading model
-                        # 5 seconds plus for certanty
-                        await asyncio.sleep(wait_time + 5.0)
-                    elif 'error' in inference_reponse:
-                        raise RuntimeError("Error Fetching API",
-                                           inference_reponse['error'])
-                    else:
-                        break
-        else:
-            raise RuntimeError(inference_reponse, "Error Fetching API")
-    else:
-
-        try:
-            print(f'Transcribing via local model')
-            output = speech_recognizer(
-                audio_memory, return_timestamps="char", chunk_length_s=10, stride_length_s=(4, 2))
-
-            transcription = output["text"].lower()
-            timestamps = [[chunk["text"].lower(), chunk["timestamp"][0].tolist(), chunk["timestamp"][1].tolist()]
-                          for chunk in output['chunks']]
-            total_inferences_since_reboot += 1
-
-            print("\n\ntotal_inferences_since_reboot: ",
-                  total_inferences_since_reboot, "\n\n")
-            return (transcription, transcription, timestamps)
-        except Exception as e:
-            raise RuntimeError("Error Running inference with local model", e)
+    try:
+        print(f'Transcribing via local model')
+        output = pipe(audio_memory, chunk_length_s=10,
+                      stride_length_s=[4, 2], return_timestamps="word")
+        transcription = output["text"]
+        chunks = output["chunks"]
+        timestamps_var = [{"word": chunk["text"], "timestamp":(
+            chunk["timestamp"][0], chunk["timestamp"][1]), "state": True} for chunk in chunks]
+
+        words = [(word['word'], '+' if word['state'] else '-')
+                 for word in timestamps_var]
+        return (words, transcription, timestamps_var, video_in.name)
+    except Exception as e:
+        raise RuntimeError("Error Running inference with local model", e)
 
 
-async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
-    """
-    Given original video input, text transcript + timestamps,
-    and edit ext cuts video segments into a single video
-    """
-    global total_cuts_since_reboot
-
-    video_path = Path(video_in)
-    video_file_name = video_path.stem
-    if (video_in == None or text_in == None or transcription == None):
+async def cut_timestamps_to_video(video_in, timestamps_var):
+    video_in = video_in[0] if isinstance(video_in, list) else video_in
+    if (video_in == None or timestamps_var == None):
         raise ValueError("Inputs undefined")
 
-    d = Differ()
-    # compare original transcription with edit text
-    diff_chars = d.compare(transcription, text_in)
-    # remove all text aditions from diff
-    filtered = list(filter(lambda x: x[0] != '+', diff_chars))
-
-    # filter timestamps to be removed
-    # timestamps_to_cut = [b for (a,b) in zip(filtered, timestamps_var) if a[0]== '-' ]
-    # return diff tokes and cutted video!!
-
-    # groupping character timestamps so there are less cuts
-    idx = 0
-    grouped = {}
-    for (a, b) in zip(filtered, timestamps):
-        if a[0] != '-':
-            if idx in grouped:
-                grouped[idx].append(b)
-            else:
-                grouped[idx] = []
-                grouped[idx].append(b)
-        else:
-            idx += 1
+    video_path = Path(video_in.name)
+    video_file_name = video_path.stem
 
-    # after grouping, gets the lower and upter start and time for each group
-    timestamps_to_cut = [[v[0][1], v[-1][2]] for v in grouped.values()]
+    timestamps_to_cut = [
+        (timestamps_var[i]['timestamp'][0], timestamps_var[i]['timestamp'][1])
+        for i in range(len(timestamps_var)) if timestamps_var[i]['state']]
 
     between_str = '+'.join(
         map(lambda t: f'between(t,{t[0]},{t[1]})', timestamps_to_cut))
 
     if timestamps_to_cut:
-        video_file = ffmpeg.input(video_in)
+        video_file = ffmpeg.input(video_path)
         video = video_file.video.filter(
            "select", f'({between_str})').filter("setpts", "N/FRAME_RATE/TB")
        audio = video_file.audio.filter(
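With this hunk, `speech_to_text` now returns every word as `{"word", "timestamp", "state"}`, and the rewritten `cut_timestamps_to_video` simply keeps the words whose `state` is still `True`. A small, self-contained sketch of that selection step, using invented example data in the same shape (the words and times below are made up, not from the commit):

```python
# Invented example data in the shape speech_to_text returns; not from the commit.
timestamps_var = [
    {"word": " just", "timestamp": (0.0, 0.4), "state": True},
    {"word": " do",   "timestamp": (0.4, 0.6), "state": False},  # toggled off in the UI
    {"word": " it",   "timestamp": (0.6, 0.9), "state": True},
]

# Same filtering cut_timestamps_to_video applies before building the ffmpeg expression.
timestamps_to_cut = [(w["timestamp"][0], w["timestamp"][1])
                     for w in timestamps_var if w["state"]]
between_str = "+".join(f"between(t,{t[0]},{t[1]})" for t in timestamps_to_cut)

print(timestamps_to_cut)  # [(0.0, 0.4), (0.6, 0.9)]
print(between_str)        # between(t,0.0,0.4)+between(t,0.6,0.9)
```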
@@ -179,124 +114,52 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
         ffmpeg.concat(video, audio, v=1, a=1).output(
             output_video).overwrite_output().global_args('-loglevel', 'quiet').run()
     else:
-        output_video = video_in
-
-    tokens = [(token[2:], token[0] if token[0] != " " else None)
-              for token in filtered]
+        output_video = video_path
 
-    total_cuts_since_reboot += 1
-    ping("video_cuts")
-    print("\n\ntotal_cuts_since_reboot: ", total_cuts_since_reboot, "\n\n")
-    return (tokens, output_video)
+    return output_video
 
 
-async def query_api(audio_bytes: bytes):
-    """
-    Query for Huggingface Inference API for Automatic Speech Recognition task
-    """
-    payload = json.dumps({
-        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
-        "parameters": {
-            "return_timestamps": "char",
-            "chunk_length_s": 10,
-            "stride_length_s": [4, 2]
-        },
-        "options": {"use_gpu": False}
-    }).encode("utf-8")
-    async with aiohttp.ClientSession() as session:
-        async with session.post(API_URL, headers=headers, data=payload) as response:
-            print("API Response: ", response.status)
-            if response.headers['Content-Type'] == 'application/json':
-                return await response.json()
-            elif response.headers['Content-Type'] == 'application/octet-stream':
-                return await response.read()
-            elif response.headers['Content-Type'] == 'text/plain':
-                return await response.text()
-            else:
-                raise RuntimeError("Error Fetching API")
-
-
-def ping(name):
-    url = f'https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}'
-    print("ping: ", url)
-
-    async def req():
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                print("pong: ", response.status)
-    asyncio.create_task(req())
-
-
-# ---- Gradio Layout -----
-video_in = gr.Video(label="Video file")
-text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
-video_out = gr.Video(label="Video Out")
-diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
-examples = gr.Dataset(components=[video_in], samples=VIDEOS, type="index")
-
-css = """
-#cut_btn, #reset_btn { align-self:stretch; }
-#\\31 3 { max-width: 540px; }
-.output-markdown {max-width: 65ch !important;}
-"""
-with gr.Blocks(css=css) as demo:
-    transcription_var = gr.Variable()
-    timestamps_var = gr.Variable()
+with gr.Blocks() as demo:
+    transcription_var = gr.State()
+    timestamps_var = gr.State()
     with gr.Row():
         with gr.Column():
             gr.Markdown("""
-            # Edit Video By Editing Text
-            This project is a quick proof of concept of a simple video editor where the edits
-            are made by editing the audio transcription.
+            # Whisper: Word-Level Video Trimming
+            Quick edit a video by trimming out words.
             Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
-            with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
-            you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
+            with [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)
             """)
 
-    with gr.Row():
-
-        examples.render()
-
-        def load_example(id):
-            video = SAMPLES[id]['video']
-            transcription = SAMPLES[id]['transcription'].lower()
-            timestamps = SAMPLES[id]['timestamps']
-
-            return (video, transcription, transcription, timestamps)
-
-        examples.click(
-            load_example,
-            inputs=[examples],
-            outputs=[video_in, text_in, transcription_var, timestamps_var],
-            queue=False)
     with gr.Row():
         with gr.Column():
-            video_in.render()
-            transcribe_btn = gr.Button("Transcribe Audio")
-            transcribe_btn.click(speech_to_text, [video_in], [
-                text_in, transcription_var, timestamps_var])
+            file_upload = gr.File(
+                label="Upload Video File", file_count=1, scale=1)
+            video_preview = gr.Video(
+                label="Video Preview", scale=3, interactive=False)
+            # with gr.Row():
+            #     transcribe_btn = gr.Button(
+            #         "Transcribe Audio")
 
-    with gr.Row():
-        gr.Markdown("""
-        ### Now edit as text
-        After running the video transcription, you can make cuts to the text below (only cuts, not additions!)""")
-
-    with gr.Row():
         with gr.Column():
-            text_in.render()
+            text_in = gr.HighlightedText(
+                label="Transcription", combine_adjacent=False, show_legend=True, color_map={"+": "green", "-": "red"})
             with gr.Row():
-                cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
-                # send audio path and hidden variables
-                cut_btn.click(cut_timestamps_to_video, [
-                    video_in, transcription_var, text_in, timestamps_var], [diff_out, video_out])
-
-                reset_transcription = gr.Button(
-                    "Reset to last trascription", elem_id="reset_btn")
-                reset_transcription.click(
-                    lambda x: x, transcription_var, text_in)
-        with gr.Column():
-            video_out.render()
-            diff_out.render()
+                cut_btn = gr.Button("Cut Video")
+                select_all_words = gr.Button("Select All Words")
+                reset_words = gr.Button("Reset Words")
+            video_out = gr.Video(label="Video Out")
+    with gr.Row():
+        gr.Examples(
+            fn=speech_to_text,
+            examples=["./examples/ShiaLaBeouf.mp4",
+                      "./examples/zuckyuval.mp4",
+                      "./examples/cooking.mp4"],
+            inputs=[file_upload],
+            outputs=[text_in, transcription_var,
+                     timestamps_var, video_preview],
+            cache_examples=True)
+
     with gr.Row():
         gr.Markdown("""
         #### Video Credits
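The unchanged context lines framing this hunk carry the actual trimming: ffmpeg's `select`/`setpts` on the video stream plus matching audio filters, driven by the `between(t,start,end)` expression, followed by `concat` and `output`. A hedged, standalone sketch of that pattern; the audio-side filter names (`aselect`, `asetpts`) are an assumption here, since those lines fall outside the hunk:

```python
# Standalone sketch of the trimming pattern (not the app's exact code).
# aselect/asetpts on the audio side are assumed; the diff elides those lines.
import ffmpeg


def cut_video(src_path: str, out_path: str, keep_ranges: list) -> None:
    # keep_ranges: list of (start, end) seconds to keep in the output
    between_str = "+".join(f"between(t,{start},{end})" for start, end in keep_ranges)
    src = ffmpeg.input(src_path)
    video = src.video.filter("select", f"({between_str})").filter(
        "setpts", "N/FRAME_RATE/TB")
    audio = src.audio.filter("aselect", f"({between_str})").filter(
        "asetpts", "N/SR/TB")
    ffmpeg.concat(video, audio, v=1, a=1).output(
        out_path).overwrite_output().global_args("-loglevel", "quiet").run()


# cut_video("input.mp4", "trimmed.mp4", [(0.0, 0.4), (0.6, 0.9)])
```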
@@ -305,6 +168,40 @@ with gr.Blocks(css=css) as demo:
         1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
         1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
         """)
+
+    def select_text(evt: gr.SelectData, timestamps_var):
+        index = evt.index
+        timestamps_var[index]['state'] = not timestamps_var[index]['state']
+        words = [(word['word'], '+' if word['state'] else '-')
+                 for word in timestamps_var]
+        return timestamps_var, words
+
+    def words_selection(timestamps_var, reset=False):
+        if reset:
+            for word in timestamps_var:
+                word['state'] = True
+        else:
+            # reverse the state of all words
+            for word in timestamps_var:
+                word['state'] = False
+
+        words = [(word['word'], '+' if word['state'] else '-')
+                 for word in timestamps_var]
+        return timestamps_var, words
+
+    file_upload.upload(speech_to_text, inputs=[file_upload], outputs=[
+        text_in, transcription_var, timestamps_var, video_preview])
+    select_all_words.click(words_selection, inputs=[timestamps_var], outputs=[
+        timestamps_var, text_in], queue=False, show_progress=False)
+    reset_words.click(lambda x: words_selection(x, True), inputs=[timestamps_var], outputs=[
+        timestamps_var, text_in], queue=False, show_progress=False)
+    text_in.select(select_text, inputs=timestamps_var,
+                   outputs=[timestamps_var, text_in], queue=False, show_progress=False)
+    # transcribe_btn.click(speech_to_text, inputs=[file_upload], outputs=[
+    #     text_in, transcription_var, timestamps_var, video_preview])
+    cut_btn.click(cut_timestamps_to_video, [
+        file_upload, timestamps_var], [video_out])
+
 demo.queue()
 if __name__ == "__main__":
     demo.launch(debug=True)
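The handlers added in this last hunk implement click-to-toggle on the highlighted transcript: `gr.SelectData.index` identifies the clicked token and its `state` flag is flipped before the `HighlightedText` is re-rendered. A reduced, self-contained sketch of that pattern, reusing the same Gradio calls the diff relies on (the component names and the three sample words are illustrative, not from the commit):

```python
# Reduced sketch of the toggle pattern; not part of the commit.
import gradio as gr

with gr.Blocks() as toggle_demo:
    # Illustrative initial state; the real app fills this from speech_to_text.
    words_state = gr.State([{"word": w, "state": True} for w in ["just", "do", "it"]])
    highlighted = gr.HighlightedText(
        value=[("just", "+"), ("do", "+"), ("it", "+")],
        color_map={"+": "green", "-": "red"}, show_legend=True)

    def toggle(evt: gr.SelectData, words):
        # Flip the clicked word and rebuild the (token, label) pairs for display.
        words[evt.index]["state"] = not words[evt.index]["state"]
        return words, [(w["word"], "+" if w["state"] else "-") for w in words]

    highlighted.select(toggle, inputs=words_state,
                       outputs=[words_state, highlighted])

# toggle_demo.launch()
```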
 
pre-requirements.txt ADDED
@@ -0,0 +1 @@
+pip
requirements.txt CHANGED
@@ -1,6 +1,12 @@
+git+https://github.com/huggingface/transformers.git
 torch
-transformers
-gradio==3.35.2
+torchaudio
+soundfile
+librosa
+moviepy
+matplotlib
+pillow
+gradio
 datasets
 librosa
 ffmpeg-python