ysharma (HF staff) committed
Commit f09eeaa
1 Parent(s): 0bce71c
Files changed (1)
  1. app.py +93 -39
app.py CHANGED
@@ -1,23 +1,63 @@
 import gradio as gr
-
-#final
-import gradio as gr
-#import json
-#from difflib import Differ
 import ffmpeg
-#import os
 from pathlib import Path
-#import time
 
 API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
 #headers = {"Authorization": "Bearer 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"}
 HF_TOKEN = os.environ["HF_TOKEN"]
 headers = {"Authorization": f"Bearer {HF_TOKEN}"}
 
-#convert video to audio
-video_path = Path("./ShiaLaBeouf.mp4")
-audio_memory, _ = ffmpeg.input(video_path).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
 
+def generate_transcripts(in_video): #generate_gifs(in_video, gif_transcript):
+    print("********* Inside generate_transcripts() **********")
+    #convert video to audio
+    print(f" input video is : {in_video}")
+
+    video_path = Path("./ShiaLaBeouf.mp4")
+    audio_memory, _ = ffmpeg.input(video_path).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
+
+    #Getting transcripts using wav2Vec2 huggingface hosted accelerated inference
+    #sending audio file in request along with stride and chunk length information
+    model_response = query_api(audio_memory)
+
+    #model response has both - transcripts as well as character timestamps or chunks
+    transcription = model_response["text"].lower()
+    chnk = model_response["chunks"]
+
+    #creating lists from chunks to consume downstream easily
+    timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
+                  for chunk in chnk]
+
+    #getting words and word timestamps
+    words, words_timestamp = get_word_timestamps(timestamps)
+    print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}")
+    print(f"Total word timestamps derived from character timestamps are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")
+
+    return transcription, words, words_timestamp
+
+
+def generate_gifs(gif_transcript, words, words_timestamp):
+    print("********* Inside generate_gifs() **********")
+
+    #creating list from input gif transcript
+    gif = "don't let your dreams be dreams"
+    #gif = gif_transcript
+    giflist = gif.split()
+
+    #getting gif indexes from the generator
+    giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
+
+    #getting start and end timestamps for a gif video
+    start_seconds, end_seconds = get_gif_timestamps(giflist_indxs, words_timestamp)
+
+    #generated .gif image
+    generate_gif(start_seconds, end_seconds)
+    #("./gifimage.gif")
+    html_out = "<img src='./gifimage.gif' />"
+
+    return html_out
+
+
 #calling the hosted model
 def query_api(audio_bytes: bytes):
     """
@@ -38,20 +78,8 @@ def query_api(audio_bytes: bytes):
     json_reponse = json.loads(response.content.decode("utf-8"))
     return json_reponse
 
-#Getting transcripts using wav2Vec2 huggingface hosted accelerated inference
-#sending audio file in request along with stride and chunk length information
-model_response = query_api(audio_memory)
-
-#model response has both - transcripts as well as character timestamps or chunks
-transcription = model_response["text"].lower()
-chnk = model_response["chunks"]
-
-#creating lists from chunks to consume downstream easily
-timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
-              for chunk in chnk]
 
-
-#getting word timestams from character timestamps
+#getting word timestamps from character timestamps
 def get_word_timestamps(timestamps):
     words, word = [], []
     letter_timestamp, word_timestamp, words_timestamp = [], [], []
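
Most of query_api's body falls outside this hunk. A hedged sketch of what the visible lines imply, assuming the requests module; note that the new file as shown would also need import os (for the HF_TOKEN lookup), import json, and import requests, none of which appear in its import block:

    import json
    import os
    import requests

    API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
    headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}

    def query_api(audio_bytes: bytes):
        # POST the raw WAV bytes to the hosted wav2vec2 model; the response
        # carries the transcript text plus character-level timestamp chunks.
        # (The commit's comments mention stride/chunk-length parameters,
        # which are not visible in the hunk and are omitted here.)
        response = requests.post(API_URL, headers=headers, data=audio_bytes)
        return json.loads(response.content.decode("utf-8"))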
@@ -68,15 +96,6 @@ def get_word_timestamps(timestamps):
     words = [word.strip() for word in words]
     return words, words_timestamp
 
-words, words_timestamp = get_word_timestamps(timestamps)
-#words = [word.strip() for word in words]
-
-print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}")
-print(f"Total Word timestamps derived fromcharacter timestamp are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")
-
-#creating list from input gif transcript
-gif = "don't let your dreams be dreams"
-giflist = gif.split()
 
 #getting index of gif words in main transcript
 def get_gif_word_indexes(total_words_list, gif_words_list):
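
get_word_timestamps's body is mostly hidden between hunks. A hedged sketch of the character-to-word aggregation it performs, assuming each timestamps entry is [character, start_seconds, end_seconds] as built in generate_transcripts: a word's start time is its first character's start, its end time its last character's end.

    def words_from_char_timestamps(timestamps):
        # hypothetical reimplementation; splits the character stream on spaces
        words, words_timestamp = [], []
        word, start, end = "", None, None
        for char, t0, t1 in timestamps:
            if char == " ":
                if word:                      # a space closes the current word
                    words.append(word)
                    words_timestamp.append([start, end])
                word, start = "", None
            else:
                if start is None:
                    start = t0                # first character opens the word
                word += char
                end = t1
        if word:                              # flush the trailing word
            words.append(word)
            words_timestamp.append([start, end])
        return words, words_timestamp

    # words_from_char_timestamps([["h",0.0,0.1],["i",0.1,0.2],[" ",0.2,0.3],
    #                             ["y",0.3,0.4],["o",0.4,0.5]])
    # -> (["hi", "yo"], [[0.0, 0.2], [0.3, 0.5]])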
@@ -90,11 +109,9 @@ def get_gif_word_indexes(total_words_list, gif_words_list):
         if total_words_list[idx:idx+lengthgif_words_list] == gif_words_list:
             yield tuple(range(idx, idx+lengthgif_words_list))
 
-#getting gif indexes from the generator
-giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
 
 #getting start and end timestamps for gif transcript
-def get_gif_timestamps(giflist_indxs):
+def get_gif_timestamps(giflist_indxs, words_timestamp):
     #giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
     min_idx = min(giflist_indxs)
     max_idx = max(giflist_indxs)
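
The visible lines of get_gif_word_indexes show the core of a sliding-window match; the loop setup is hidden between hunks. A self-contained sketch of the same idea:

    def find_phrase_indexes(words, phrase_words):
        # Slide a window of len(phrase_words) across the transcript and
        # yield the matching index range wherever the window is equal.
        n = len(phrase_words)
        for idx in range(len(words) - n + 1):
            if words[idx:idx + n] == phrase_words:
                yield tuple(range(idx, idx + n))

    # next(find_phrase_indexes("don't let your dreams be dreams".split(),
    #                          ["dreams", "be"]))  -> (3, 4)

generate_gifs takes the first match and hands its index range to get_gif_timestamps, which (per the next hunk) reads the start of the first word and the end of the last word from words_timestamp.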
@@ -103,15 +120,52 @@
     start_seconds, end_seconds = gif_words_timestamp[0][0], gif_words_timestamp[-1][-1]
     return start_seconds, end_seconds
 
-#getting start and end timestamps for a gif video
-start_seconds, end_seconds = get_gif_timestamps(giflist_indxs)
 
 #extracting the video and building and serving a .gif image
 def generate_gif(start_seconds, end_seconds):
     final_clip = video.subclip(start_seconds, end_seconds)
     #final_clip.write_videofile("/content/gdrive/My Drive/AI/videoedit/gif1.mp4")
-    final_clip.write_gif("/content/gdrive/My Drive/AI/videoedit/gif1.gif",)
+    final_clip.write_gif("./gifimage.gif",)
     final_clip.close()
     return
 
-generate_gif(start_seconds, end_seconds)
+
+sample_video = ['./ShiaLaBeouf.mp4']
+sample_vid = gr.Video(label='Video file') #for displaying the example
+examples = gr.components.Dataset(components=[sample_vid], samples=[sample_video], type='values')
+
+
+demo = gr.Blocks()
+
+with demo:
+    with gr.Row():
+        input_video = gr.Video(label="Upload a Video", visible=True) #for incoming video
+        text_transcript = gr.Textbox(label="Transcripts", lines=10, interactive=True) #to generate and display transcriptions for input video
+        text_words = gr.Textbox(visible=False)
+        text_wordstimestamps = gr.Textbox(visible=False)
+        text_gif_transcript = gr.Textbox(label="Transcripts", placeholder="Copy paste transcripts here to create GIF image", lines=3, interactive=True) #to copy paste required gif transcript
+        out_gif = gr.HTML(label="Generated GIF from transcript selected", show_label=True)
+
+    examples.render()
+    def load_examples(video): #to load sample video into input_video upon clicking on it
+        print("****** inside load_example() ******")
+        print("in_video is : ", video)
+        return video
+
+    examples.click(load_examples, examples, input_video)
+
+    with gr.Row():
+        button_transcript = gr.Button("Generate transcripts")
+        button_gifs = gr.Button("Create Gif")
+
+    #def load_gif():
+    #    print("****** inside load_gif() ******")
+    #    #created embedding width='560' height='315'
+    #    html_out = "<img src='./gifimage.gif' />"
+    #    print(f"html output is : {html_out}")
+    #    return
+
+    button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps])
+    button_gifs.click(generate_gifs, [text_gif_transcript, text_words, text_wordstimestamps], out_gif)
+
+
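
generate_gif slices a module-level video clip that is never defined anywhere in the diff (a leftover from the original Colab version, to judge by the commented Drive paths). A hedged sketch of the missing setup, assuming moviepy 1.x and the repo's sample video:

    from moviepy.editor import VideoFileClip

    video = VideoFileClip("./ShiaLaBeouf.mp4")  # assumed global used by generate_gif

    def generate_gif(start_seconds, end_seconds):
        # Cut the matched span out of the source clip and write it as a GIF
        # next to the app so the <img> tag in generate_gifs can serve it.
        final_clip = video.subclip(start_seconds, end_seconds)
        final_clip.write_gif("./gifimage.gif")
        final_clip.close()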
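
The new Blocks UI threads state between the two buttons through hidden textboxes: button_transcript writes the word list and word timestamps into invisible components, and button_gifs reads them back alongside the user's phrase. A minimal sketch of the same wiring pattern with stub callbacks, assuming gradio 3.x (demo.launch() is not visible in the diff, so its presence here is an assumption):

    import gradio as gr

    def make_transcript(video):
        return "transcript ...", "word list ...", "timestamps ..."  # stub ASR step

    def make_gif(phrase, words, timestamps):
        return "<img src='./gifimage.gif' />"  # stub GIF step

    with gr.Blocks() as demo:
        video_in = gr.Video(label="Upload a Video")
        transcript = gr.Textbox(label="Transcripts", lines=10)
        words = gr.Textbox(visible=False)   # hidden hand-off between steps
        stamps = gr.Textbox(visible=False)
        phrase = gr.Textbox(label="GIF phrase", lines=3)
        gif_out = gr.HTML()
        gr.Button("Generate transcripts").click(make_transcript, video_in,
                                                [transcript, words, stamps])
        gr.Button("Create Gif").click(make_gif, [phrase, words, stamps], gif_out)

    demo.launch()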