radames committed on
Commit
f02b7b3
·
1 Parent(s): 505b98a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -66
app.py CHANGED
@@ -228,7 +228,7 @@ def ping(name):
228
 
229
 
230
  # ---- Gradio Layout -----
231
- video_in = gr.Video(label="Video file")
232
  text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
233
  video_out = gr.Video(label="Video Out")
234
  diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
@@ -238,78 +238,76 @@ css = """
238
  #cut_btn, #reset_btn { align-self:stretch; }
239
  #\\31 3 { max-width: 540px; }
240
  .output-markdown {max-width: 65ch !important;}
241
- #container{
242
- margin: 0 auto;
243
  max-width: 40rem;
244
  }
245
  """
246
  with gr.Blocks(css=css) as demo:
247
- with gr.Column(elem_id="container"):
248
  transcription_var = gr.State()
249
- timestamps_var = gr.State()
250
- with gr.Row():
251
- with gr.Column():
252
- gr.Markdown("""
253
- # Edit Video By Editing Text
254
- This project is a quick proof of concept of a simple video editor where the edits
255
- are made by editing the audio transcription.
256
- Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
257
- with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
258
- you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
259
- """)
260
-
261
- with gr.Row():
262
-
263
- examples.render()
264
-
265
- def load_example(id):
266
- video = SAMPLES[id]['video']
267
- transcription = SAMPLES[id]['transcription'].lower()
268
- timestamps = SAMPLES[id]['timestamps']
269
-
270
- return (video, transcription, transcription, timestamps)
271
-
272
- examples.click(
273
- load_example,
274
- inputs=[examples],
275
- outputs=[video_in, text_in, transcription_var, timestamps_var],
276
- queue=False)
277
- with gr.Row():
278
- with gr.Column():
279
- video_in.render()
280
- transcribe_btn = gr.Button("Transcribe Audio")
281
- transcribe_btn.click(speech_to_text, [video_in], [
282
- text_in, transcription_var, timestamps_var])
283
-
284
- with gr.Row():
285
  gr.Markdown("""
286
- ### Now edit as text
287
- After running the video transcription, you can make cuts to the text below (only cuts, not additions!)""")
288
-
289
- with gr.Row():
290
- with gr.Column():
291
- text_in.render()
292
- with gr.Row():
293
- cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
294
- # send audio path and hidden variables
295
- cut_btn.click(cut_timestamps_to_video, [
296
- video_in, transcription_var, text_in, timestamps_var], [diff_out, video_out])
297
-
298
- reset_transcription = gr.Button(
299
- "Reset to last trascription", elem_id="reset_btn")
300
- reset_transcription.click(
301
- lambda x: x, transcription_var, text_in)
302
- with gr.Column():
303
- video_out.render()
304
- diff_out.render()
305
- with gr.Row():
306
- gr.Markdown("""
307
- #### Video Credits
308
-
309
- 1. [Cooking](https://vimeo.com/573792389)
310
- 1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
311
- 1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
312
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  demo.queue()
314
  if __name__ == "__main__":
315
  demo.launch(debug=True)
 
228
 
229
 
230
  # ---- Gradio Layout -----
231
+ video_in = gr.Video(label="Video file", elem_id="video-container")
232
  text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
233
  video_out = gr.Video(label="Video Out")
234
  diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
 
238
  #cut_btn, #reset_btn { align-self:stretch; }
239
  #\\31 3 { max-width: 540px; }
240
  .output-markdown {max-width: 65ch !important;}
241
+ #video-container{
 
242
  max-width: 40rem;
243
  }
244
  """
245
  with gr.Blocks(css=css) as demo:
 
246
  transcription_var = gr.State()
247
+ timestamps_var = gr.State()
248
+ with gr.Row():
249
+ with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  gr.Markdown("""
251
+ # Edit Video By Editing Text
252
+ This project is a quick proof of concept of a simple video editor where the edits
253
+ are made by editing the audio transcription.
254
+ Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
255
+ with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
256
+ you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  """)
258
+
259
+ with gr.Row():
260
+
261
+ examples.render()
262
+
263
+ def load_example(id):
264
+ video = SAMPLES[id]['video']
265
+ transcription = SAMPLES[id]['transcription'].lower()
266
+ timestamps = SAMPLES[id]['timestamps']
267
+
268
+ return (video, transcription, transcription, timestamps)
269
+
270
+ examples.click(
271
+ load_example,
272
+ inputs=[examples],
273
+ outputs=[video_in, text_in, transcription_var, timestamps_var],
274
+ queue=False)
275
+ with gr.Row():
276
+ with gr.Column():
277
+ video_in.render()
278
+ transcribe_btn = gr.Button("Transcribe Audio")
279
+ transcribe_btn.click(speech_to_text, [video_in], [
280
+ text_in, transcription_var, timestamps_var])
281
+
282
+ with gr.Row():
283
+ gr.Markdown("""
284
+ ### Now edit as text
285
+ After running the video transcription, you can make cuts to the text below (only cuts, not additions!)""")
286
+
287
+ with gr.Row():
288
+ with gr.Column():
289
+ text_in.render()
290
+ with gr.Row():
291
+ cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
292
+ # send audio path and hidden variables
293
+ cut_btn.click(cut_timestamps_to_video, [
294
+ video_in, transcription_var, text_in, timestamps_var], [diff_out, video_out])
295
+
296
+ reset_transcription = gr.Button(
297
+ "Reset to last trascription", elem_id="reset_btn")
298
+ reset_transcription.click(
299
+ lambda x: x, transcription_var, text_in)
300
+ with gr.Column():
301
+ video_out.render()
302
+ diff_out.render()
303
+ with gr.Row():
304
+ gr.Markdown("""
305
+ #### Video Credits
306
+
307
+ 1. [Cooking](https://vimeo.com/573792389)
308
+ 1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
309
+ 1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
310
+ """)
311
  demo.queue()
312
  if __name__ == "__main__":
313
  demo.launch(debug=True)