Samuelblue committed
Commit
c9f34d6
1 Parent(s): 190d480

Create app.py

Files changed (1)
  1. app.py +329 -0
app.py ADDED
@@ -0,0 +1,329 @@
import gradio as gr
import json
from difflib import Differ
import ffmpeg
import os
from pathlib import Path
import time
import aiohttp
import asyncio

# Set to True to use the Hugging Face Inference API (https://huggingface.co/inference-api);
# set to False to run the model locally with a transformers pipeline.
API_BACKEND = True
MODEL = "facebook/wav2vec2-base-960h"

if API_BACKEND:
    from dotenv import load_dotenv
    import base64
    load_dotenv(Path(".env"))

    HF_TOKEN = os.environ["HF_TOKEN"]
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    API_URL = f'https://api-inference.huggingface.co/models/{MODEL}'

else:
    import torch
    from transformers import pipeline

    # is CUDA available?
    cuda = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    device = 0 if torch.cuda.is_available() else -1
    speech_recognizer = pipeline(
        task="automatic-speech-recognition",
        model=MODEL,
        tokenizer=MODEL,
        framework="pt",
        device=device,
    )

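# With API_BACKEND=True no model is loaded in this process: transcription requests go to the
# hosted Inference API. With False, the transformers pipeline above runs locally (GPU if available).
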
videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)

# each examples/*.json file holds a sample video path plus its pre-computed
# transcription and character timestamps
samples_data = sorted(Path('examples').glob('*.json'))
SAMPLES = []
for file in samples_data:
    with open(file) as f:
        sample = json.load(f)
        SAMPLES.append(sample)
VIDEOS = list(map(lambda x: [x['video']], SAMPLES))

total_inferences_since_reboot = 0
total_cuts_since_reboot = 0

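
# speech_to_text returns (transcription, transcription, timestamps): the first copy fills the
# editable textbox, the second is kept as hidden state for later diffing, and timestamps is a
# list of [character, start_seconds, end_seconds] entries used to map text edits back to video time.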
async def speech_to_text(video_file_path):
    """
    Takes a video path, converts it to audio, and transcribes the audio channel
    to text plus character-level timestamps.

    Uses the https://huggingface.co/tasks/automatic-speech-recognition pipeline
    """
    global total_inferences_since_reboot
    if video_file_path is None:
        raise ValueError("Error: no video input")

    video_path = Path(video_file_path)
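    # Extract the audio track in memory: ffmpeg writes a mono (ac=1), 16 kHz (ar='16k') WAV to
    # stdout ('-'), the sample rate and channel count wav2vec2-base-960h expects.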
    try:
        # convert video to audio, piped into audio_memory
        audio_memory, _ = ffmpeg.input(video_path).output(
            '-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
    except Exception as e:
        raise RuntimeError("Error converting video to audio") from e

    ping("speech_to_text")
    last_time = time.time()
    if API_BACKEND:
        # Using the Inference API https://huggingface.co/inference-api
        # Retry a few times, because the hosted model may still be loading.
        inference_response = {}
        for i in range(10):
            for tries in range(4):
                print(f'Transcribing from API attempt {tries}')
                try:
                    inference_response = await query_api(audio_memory)
                    print(inference_response)
                    transcription = inference_response["text"].lower()
                    timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
                                  for chunk in inference_response['chunks']]

                    total_inferences_since_reboot += 1
                    print("\n\ntotal_inferences_since_reboot: ",
                          total_inferences_since_reboot, "\n\n")
                    return (transcription, transcription, timestamps)
                except Exception as e:
                    print(e)
                    if 'error' in inference_response and 'estimated_time' in inference_response:
                        wait_time = inference_response['estimated_time']
                        print("Waiting for model to load....", wait_time)
                        # wait for the model to load, plus 5 seconds for certainty
                        await asyncio.sleep(wait_time + 5.0)
                    elif 'error' in inference_response:
                        raise RuntimeError("Error Fetching API",
                                           inference_response['error'])
                    else:
                        break
        else:
            raise RuntimeError(inference_response, "Error Fetching API")
    else:
        try:
            print('Transcribing via local model')
            output = speech_recognizer(
                audio_memory, return_timestamps="char", chunk_length_s=10, stride_length_s=(4, 2))

            transcription = output["text"].lower()
            timestamps = [[chunk["text"].lower(), chunk["timestamp"][0].tolist(), chunk["timestamp"][1].tolist()]
                          for chunk in output['chunks']]
            total_inferences_since_reboot += 1

            print("\n\ntotal_inferences_since_reboot: ",
                  total_inferences_since_reboot, "\n\n")
            return (transcription, transcription, timestamps)
        except Exception as e:
            raise RuntimeError("Error running inference with local model", e)

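
# cut_timestamps_to_video works at the character level: difflib.Differ compares the original
# transcription with the edited text, characters the user deleted are dropped, and runs of kept
# characters are grouped so that each group's first and last timestamps define a segment to keep.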
async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps, words_to_cut):
    """
    Given the original video, the transcript + character timestamps, the edited text,
    and an optional comma-separated list of words to cut, concatenates the kept
    video segments into a single output video.
    """
    global total_cuts_since_reboot

    if video_in is None or text_in is None or transcription is None:
        raise ValueError("Inputs undefined")

    video_path = Path(video_in)
    video_file_name = video_path.stem

    d = Differ()
    # compare the original transcription with the edited text
    diff_chars = d.compare(transcription, text_in)

    # drop '+' entries: characters added in the edit have no timestamps in the original audio
    filtered = list(filter(lambda x: x[0] != '+', diff_chars))

    # character index spans of every occurrence of the words the user asked to cut
    words = [w.strip() for w in (words_to_cut or "").split(",") if w.strip()]
    cut_spans = find_words_to_cut(transcription, words)

    # group consecutive kept characters; a deleted character or a cut word closes the current group
    idx = 0
    grouped = {}
    for pos, (a, b) in enumerate(zip(filtered, timestamps)):
        in_cut_word = any(start <= pos < end for (start, end) in cut_spans)
        if a[0] != '-' and not in_cut_word:
            grouped.setdefault(idx, []).append(b)
        else:
            idx += 1

    # for each group, keep from the start of its first timestamp to the end of its last
    timestamps_to_cut = [[v[0][1], v[-1][2]] for v in grouped.values()]

    between_str = '+'.join(
        map(lambda t: f'between(t,{t[0]},{t[1]})', timestamps_to_cut))
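
    # ffmpeg keeps only frames/samples whose timestamp falls inside a kept segment: select/aselect
    # evaluate the between(t,start,end) expressions, and setpts/asetpts regenerate timestamps so
    # the remaining pieces play back contiguously.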
    if timestamps_to_cut:
        video_file = ffmpeg.input(video_in)
        video = video_file.video.filter(
            "select", f'({between_str})').filter("setpts", "N/FRAME_RATE/TB")
        audio = video_file.audio.filter(
            "aselect", f'({between_str})').filter("asetpts", "N/SR/TB")

        output_video = f'./videos_out/{video_file_name}.mp4'
        ffmpeg.concat(video, audio, v=1, a=1).output(
            output_video).overwrite_output().global_args('-loglevel', 'quiet').run()
    else:
        output_video = video_in

    # tokens for the HighlightedText diff view: (character, '-' if it was cut, otherwise no label)
    tokens = [(token[2:], token[0] if token[0] != " " else None)
              for token in filtered]

    total_cuts_since_reboot += 1
    ping("video_cuts")
    print("\n\ntotal_cuts_since_reboot: ", total_cuts_since_reboot, "\n\n")
    return (tokens, output_video)

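
# query_api sends the raw WAV bytes base64-encoded in the JSON "inputs" field and forwards the
# same char-timestamp parameters as the local pipeline, so a successful response has the shape
# {"text": ..., "chunks": [{"text": ..., "timestamp": [start, end]}, ...]}.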
async def query_api(audio_bytes: bytes):
    """
    Query the Hugging Face Inference API for the automatic speech recognition task.
    """
    payload = json.dumps({
        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
        "parameters": {
            "return_timestamps": "char",
            "chunk_length_s": 10,
            "stride_length_s": [4, 2]
        },
        "options": {"use_gpu": False}
    }).encode("utf-8")
    async with aiohttp.ClientSession() as session:
        async with session.post(API_URL, headers=headers, data=payload) as response:
            print("API Response: ", response.status)
            if response.headers['Content-Type'] == 'application/json':
                return await response.json()
            elif response.headers['Content-Type'] == 'application/octet-stream':
                return await response.read()
            elif response.headers['Content-Type'] == 'text/plain':
                return await response.text()
            else:
                raise RuntimeError("Error Fetching API")

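
# ping fires a non-blocking telemetry GET; asyncio.create_task needs a running event loop, so
# this only works when called from inside one of the async Gradio handlers above.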
def ping(name):
    url = f'https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}'
    print("ping: ", url)

    async def req():
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                print("pong: ", response.status)
    asyncio.create_task(req())

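
# helper for the words-to-cut feature: locate occurrences of the given words in the transcription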
def find_words_to_cut(transcription, words):
    """
    Find the character index spans (start, end) of every occurrence of the given
    words in the transcription.
    """
    spans = []
    for word in words:
        word_lower = word.lower()
        if not word_lower:
            continue
        start = transcription.find(word_lower)
        while start != -1:
            spans.append((start, start + len(word_lower)))
            start = transcription.find(word_lower, start + 1)
    return spans


# ---- Gradio Layout -----
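# Components are created up front so they can be wired into the callbacks, then placed
# on the page with .render() inside the gr.Blocks layout below.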
video_in = gr.Video(label="Video file", elem_id="video-container")
text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
words_to_cut_input = gr.Textbox(label="Words to Cut (comma-separated)", lines=1, value="")
video_out = gr.Video(label="Video Out")
diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
examples = gr.Dataset(components=[video_in], samples=VIDEOS, type="index")

css = """
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
#video-container{
    max-width: 40rem;
}
"""
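
# transcription_var and timestamps_var are per-session gr.State values that carry the last
# transcription and its character timestamps from the transcribe step to the cut step.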
with gr.Blocks(css=css) as demo:
    transcription_var = gr.State()
    timestamps_var = gr.State()
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            # Edit Video By Editing Text
            This project is a quick proof of concept of a simple video editor where the edits
            are made by editing the audio transcription.
            Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
            with a fine-tuned model, you can predict the text transcription and character- or word-based timestamps.
            """)

    with gr.Row():
        examples.render()

        def load_example(id):
            video = SAMPLES[id]['video']
            transcription = SAMPLES[id]['transcription'].lower()
            timestamps = SAMPLES[id]['timestamps']

            return (video, transcription, transcription, timestamps)

        # the Dataset is created with type="index", so the click handler receives the sample index
        examples.click(
            load_example,
            inputs=[examples],
            outputs=[video_in, text_in, transcription_var, timestamps_var],
            queue=False)

    with gr.Row():
        with gr.Column():
            video_in.render()
            transcribe_btn = gr.Button("Transcribe Audio")
            transcribe_btn.click(speech_to_text, [video_in], [
                text_in, transcription_var, timestamps_var])

    with gr.Row():
        gr.Markdown("""
        ### Now edit as text
        After running the video transcription, you can make cuts to the text below (only cuts, not additions!).
        You can also specify words to cut in the "Words to Cut" input box.
        """)

    with gr.Row():
        with gr.Column():
            text_in.render()
            words_to_cut_input.render()
            with gr.Row():
                cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
                # send the video path, the hidden state variables, the edited text, and the words to cut
                cut_btn.click(cut_timestamps_to_video, [
                    video_in, transcription_var, text_in, timestamps_var, words_to_cut_input], [diff_out, video_out])

                reset_transcription = gr.Button(
                    "Reset to last transcription", elem_id="reset_btn")
                reset_transcription.click(
                    lambda x: x, transcription_var, text_in)
        with gr.Column():
            video_out.render()
            diff_out.render()

    with gr.Row():
        gr.Markdown("""
        #### Video Credits

        1. [Cooking](https://vimeo.com/573792389)
        2. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
        3. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
        """)

demo.launch(debug=True)