ysharma (HF staff) committed on
Commit 237602b • 1 Parent(s): 5c66834

created app.py

Files changed (1)
  1. app.py +303 -0
app.py ADDED
@@ -0,0 +1,303 @@
import gradio as gr
import ffmpeg
from pathlib import Path
import os
import ast
import json
import base64
import requests
import moviepy.editor as mp
from PIL import Image, ImageSequence
import cv2

API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
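# NOTE: HF_TOKEN must be set in the environment (e.g. as a Space secret);
# os.environ["HF_TOKEN"] raises a KeyError when it is missing.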
video_list = []
def generate_transcripts(in_video):
    print("********* Inside generate_transcripts() **********")
    #convert video to audio
    print(f"input video is : {in_video}")

    #sample
    #video_path = Path("./ShiaLaBeouf.mp4")
    audio_memory, _ = ffmpeg.input(in_video).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
    #audio_memory, _ = ffmpeg.input(video_path).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)

    #getting transcripts from Wav2Vec2 via the Hugging Face hosted (accelerated) Inference API
    #sending the audio bytes in the request along with stride and chunk-length information
    model_response = query_api(audio_memory)

    #the model response has both the transcript and the character-level timestamps (chunks)
    print(f"model_response is : {model_response}")
    transcription = model_response["text"].lower()
    chnk = model_response["chunks"]

    #creating lists from the chunks so they are easy to consume downstream
    timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
                  for chunk in chnk]

    #getting words and word timestamps
    words, words_timestamp = get_word_timestamps(timestamps)
    print(f"Total words in the audio transcript: {len(words)}, transcript word list: {words}, type of words: {type(words)}")
    print(f"Total word timestamps derived from character timestamps: {len(words_timestamp)}, word timestamps: {words_timestamp}")

    return transcription, words, words_timestamp

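# Illustrative shape of the character-timestamp triples built in
# generate_transcripts() above (values are hypothetical):
#   [['h', 0.10, 0.18], ['i', 0.18, 0.25], [' ', 0.25, 0.25], ...]
# i.e. one [character, start_second, end_second] entry per chunk.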

def generate_gifs(in_video, gif_transcript, words, words_timestamp, vid_speed):
    print("********* Inside generate_gifs() **********")

    #creating a word list from the input gif transcript
    #example: gif = "don't let your dreams be dreams"
    gif = gif_transcript
    giflist = gif.split()

    #getting gif indexes from the generator
    #the word lists arrive as strings (round-tripped through hidden Textboxes), so convert them back to lists
    words = ast.literal_eval(words)
    words_timestamp = ast.literal_eval(words_timestamp)
    print(f"words is :{words}")
    print(f"type of words is :{type(words)}")
    print(f"length of words is :{len(words)}")
    print(f"giflist is :{giflist}")

    giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
    print(f"giflist_indxs is : {giflist_indxs}")
    #getting start and end timestamps for the gif video
    start_seconds, end_seconds = get_gif_timestamps(giflist_indxs, words_timestamp)
    print(f"start_seconds, end_seconds are : ({start_seconds}, {end_seconds})")
    #generating the edited video
    #gif_out, vid_out = gen_moviepy_gif(in_video, start_seconds, end_seconds)
    print(f"vid_speed from Slider is : {vid_speed}")

    speededit_vids_list, concat_vid = gen_moviepy_gif(in_video, start_seconds, end_seconds, float(vid_speed), video_list)

    return concat_vid


#calling the hosted model
def query_api(audio_bytes: bytes):
    """
    Query the Hugging Face Inference API for the Automatic Speech Recognition task
    """
    print("********* Inside query_api() **********")
    payload = json.dumps({
        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
        "parameters": {
            "return_timestamps": "char",
            "chunk_length_s": 10,
            "stride_length_s": [4, 2]
        },
        "options": {"use_gpu": False}
    }).encode("utf-8")

    response = requests.post(API_URL, headers=headers, data=payload)
    json_response = json.loads(response.content.decode("utf-8"))
    print(f"json_response is :{json_response}")
    return json_response

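# Expected shape of the API response consumed above (illustrative values only):
#   {"text": "DON'T LET YOUR DREAMS BE DREAMS",
#    "chunks": [{"text": "d", "timestamp": [0.10, 0.18]}, ...]}
# "text" is the full transcript; "chunks" carries per-character timestamps
# because return_timestamps is set to "char" in the payload.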
#getting word timestamps from character timestamps
def get_word_timestamps(timestamps):
    words, word = [], []
    letter_timestamp, word_timestamp, words_timestamp = [], [], []
    for idx, entry in enumerate(timestamps):
        word.append(entry[0])
        letter_timestamp.append(entry[1])
        if entry[0] == ' ':
            words.append(''.join(word))
            word_timestamp.append(letter_timestamp[0])
            word_timestamp.append(timestamps[idx-1][2])
            words_timestamp.append(word_timestamp)
            word, word_timestamp, letter_timestamp = [], [], []

    words = [word.strip() for word in words]
    return words, words_timestamp

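# Worked example (hypothetical character triples for the phrase "hi there "):
#   input : [['h', 0.10, 0.18], ['i', 0.18, 0.25], [' ', 0.25, 0.25],
#            ['t', 0.30, 0.34], ['h', 0.34, 0.38], ['e', 0.38, 0.41],
#            ['r', 0.41, 0.45], ['e', 0.45, 0.50], [' ', 0.50, 0.50]]
#   output: words = ['hi', 'there'],
#           words_timestamp = [[0.10, 0.25], [0.30, 0.50]]
# A word boundary is detected at each space; a word's start is its first
# letter's start time and its end is the previous character's end time.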

#getting the indexes of the gif words in the main transcript
def get_gif_word_indexes(total_words_list, gif_words_list):
    if not gif_words_list:
        return
    #just an optimization: precompute the phrase length and its first word
    lengthgif_words_list = len(gif_words_list)
    firstgif_words_list = gif_words_list[0]

    print(f"total_words_list is :{total_words_list}")
    print(f"length of total_words_list is :{len(total_words_list)}")
    print(f"gif_words_list is :{gif_words_list}")
    print(f"length of gif_words_list is :{len(gif_words_list)}")

    for idx, item in enumerate(total_words_list):
        if item == firstgif_words_list:
            if total_words_list[idx:idx+lengthgif_words_list] == gif_words_list:
                print(f"value of tuple is : {tuple(range(idx, idx+lengthgif_words_list))}")
                yield tuple(range(idx, idx+lengthgif_words_list))

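# Illustrative call (hypothetical word lists):
#   list(get_gif_word_indexes(['so', 'just', 'do', 'it'], ['just', 'do']))
#   -> [(1, 2)]
# Each yielded tuple holds the consecutive indexes where the requested
# phrase occurs in the full transcript; generate_gifs() uses the first match.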

#getting start and end timestamps for the gif transcript
def get_gif_timestamps(giflist_indxs, words_timestamp):
    print("******** Inside get_gif_timestamps() **********")
    min_idx = min(giflist_indxs)
    max_idx = max(giflist_indxs)
    print(f"min_idx is :{min_idx}")
    print(f"max_idx is :{max_idx}")

    gif_words_timestamp = words_timestamp[min_idx : max_idx+1]
    print(f"words_timestamp is :{words_timestamp}")
    print(f"gif_words_timestamp is :{gif_words_timestamp}")

    start_seconds, end_seconds = gif_words_timestamp[0][0], gif_words_timestamp[-1][-1]
    print(f"start_seconds, end_seconds are :{start_seconds},{end_seconds}")

    return start_seconds, end_seconds

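# Example continued from above: for indexes (1, 2) and
#   words_timestamp = [[0.0, 0.2], [0.3, 0.5], [0.6, 0.9], [1.0, 1.2]]
# the slice covers entries 1..2, so start_seconds = 0.3 and end_seconds = 0.9.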

#extracting the clip, re-timing it, and serving the edited video
def gen_moviepy_gif(in_video, start_seconds, end_seconds, vid_speed, vid_list):
    print("******** Inside gen_moviepy_gif() **********")
    #sample
    #video_path = "./ShiaLaBeouf.mp4"
    video = mp.VideoFileClip(in_video)
    #video = mp.VideoFileClip(video_path)

    #rounding the cut points up to the next whole second for the untouched lead-in / lead-out clips
    leftover_clip_start = video.subclip(0, int(start_seconds) + float("{:.2f}".format(1 - start_seconds % 1))).without_audio()
    final_clip = video.subclip(start_seconds, end_seconds)
    leftover_clip_end = video.subclip(int(end_seconds) + float("{:.2f}".format(1 - end_seconds % 1))).without_audio() #runs to the end of the video

    #re-timing: a speed factor below 1 slows the clip down, above 1 speeds it up
    print(f"vid_speed from the calling function is : {vid_speed}")
    speededit_clip = final_clip.fx(mp.vfx.speedx, vid_speed)
    speededit_clip = speededit_clip.without_audio()

    #concatenating the lead-in, the re-timed clip, and the lead-out
    concatenated_clip = mp.concatenate_videoclips([leftover_clip_start, speededit_clip, leftover_clip_end])
    concatenated_clip.write_videofile("concat.mp4")

    filename = f"speededit{len(vid_list)}"
    print("filename is :", filename)
    speededit_clip.write_videofile("speededit.mp4") #(filename)
    vid_list.append("speededit.mp4") #(filename)

    if len(vid_list) == 1:
        speededit_clip.write_videofile("slomo.mp4")
    elif len(vid_list) == 2:
        speededit_clip.write_videofile("timelapse.mp4")

    #writing to disk - gif and smaller clip
    #final_clip.write_gif("gifimage.gif") #, program='ffmpeg', tempfiles=True, fps=15, fuzz=3)
    #final_clip.write_videofile("gifimage.mp4")
    final_clip.close()
    #reading into a variable
    #gif_img = mp.VideoFileClip("gifimage.gif")
    #gif_vid = mp.VideoFileClip("gifimage.mp4")
    #im = Image.open("gifimage.gif")
    #vid_cap = cv2.VideoCapture('gifimage.mp4')
    return vid_list, "concat.mp4"

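# Note: the same function serves both modes. The SloMo slider passes a factor
# in 0.1-0.9 (slower than real time) and the TimeLapse slider a factor in
# 1-2 (faster), so mp.vfx.speedx covers both cases with one code path.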

sample_video = ["olympic100m.mp4"] #[['./ShiaLaBeouf.mp4']]
sample_vid = gr.Video(label='Video file') #for displaying the example
examples = gr.components.Dataset(components=[sample_vid], samples=[sample_video], type='values')


demo = gr.Blocks()

with demo:
    gr.Markdown("""# **Watch your video in SloMo or in Timelapse!** """)
    gr.Markdown("""
    ### **This is still a work in progress...** Editing your video using an ASR pipeline.

    A Space by [Yuvraj Sharma](https://huggingface.co/ysharma).

    **Background:** In this Gradio Blocks Party Space, I am trying to:
    - Provide the capability to slow down your video
    - Timelapse your video

    **How To Use:**
    1. Upload a video, or simply click on the sample provided here.
    2. Then click on the 'Generate transcripts' button; the first textbox will display the transcript extracted from your sample's audio.
    3. Copy text from the transcript, or type a transcript manually, into the second textbox provided.
    4. A slowed-down or timelapsed version of your video will be generated on the right-hand side!

    Hope you have fun using this 😀
    """)

    with gr.Row():
        #for the incoming video
        input_video = gr.Video(label="Upload a Video", visible=True)
        #to generate and display the transcript for the input video
        text_transcript = gr.Textbox(label="Transcripts", lines=10, interactive=True)

        #just to move data between functions, hence keeping these invisible
        text_words = gr.Textbox(visible=False)
        text_wordstimestamps = gr.Textbox(visible=False)

    with gr.Row():
        button_transcript = gr.Button("Generate transcripts")

    #For SloMo
    with gr.Row():
        #to copy-paste the required transcript snippet / or to be populated automatically on pressing the button
        text_slomo_transcript = gr.Textbox(label="Transcripts", placeholder="Copy-paste transcripts here to create a SlowMo video", lines=5, interactive=True)

        def load_slomo_text(text):
            print("****** Inside load_slomo_text() ******")
            print("text for slomo video is : ", text)
            return text

        text_transcript.change(load_slomo_text, text_transcript, text_slomo_transcript)

        #out_gif = gr.Image(label="Generated GIF image")
        out_slomo_vid = gr.Video(label="Generated SlowMo Video")

    with gr.Row():
        #button_transcript = gr.Button("Generate transcripts")
        vid_speed_slomo = gr.Slider(0.1, 0.9, step=0.1)
        button_slomo = gr.Button("Create SloMo")

    #For TimeLapse
    with gr.Row():
        #to copy-paste the required transcript snippet / or to be populated automatically on pressing the button
        text_timelapse_transcript = gr.Textbox(label="Transcripts", placeholder="Copy-paste transcripts here to create a TimeLapse video", lines=5) #, interactive=True)

        def load_timelapse_text(text):
            print("****** Inside load_timelapse_text() ******")
            print("text for timelapse video is : ", text)
            return text

        text_transcript.change(load_timelapse_text, text_transcript, text_timelapse_transcript)

        #out_gif = gr.Image(label="Generated GIF image")
        out_timelapse_vid = gr.Video(label="Generated TimeLapse Video")

    with gr.Row():
        #button_transcript = gr.Button("Generate transcripts")
        vid_speed_timelapse = gr.Slider(1, 2, step=0.25)
        button_timelapse = gr.Button("Create TimeLapse")

    with gr.Row():
        #to render the video example on mouse hover/click
        examples.render()
        #to load the sample video into input_video upon clicking on it
        def load_examples(video):
            print("****** Inside load_examples() ******")
            print("in_video is : ", video[0])
            return video[0]

        examples.click(load_examples, examples, input_video)

    #vid_speed = gr.Slider(0.1,0.9, step=0.1)

    button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps])
    button_slomo.click(generate_gifs, [input_video, text_slomo_transcript, text_words, text_wordstimestamps, vid_speed_slomo], out_slomo_vid)
    #note: the timelapse path takes the SloMo output video as its input clip
    button_timelapse.click(generate_gifs, [out_slomo_vid, text_timelapse_transcript, text_words, text_wordstimestamps, vid_speed_timelapse], out_timelapse_vid)

demo.launch(debug=True)