ysharma HF staff committed on
Commit
c92afc1
•
1 Parent(s): 9428f01

create app.py

Files changed (1)
  1. app.py +270 -0
app.py ADDED
@@ -0,0 +1,270 @@
+ import gradio as gr
+ import ffmpeg
+ from pathlib import Path
+ import os
+ import ast
+ import json
+ import base64
+ import requests
+ import moviepy.editor as mp
+ from PIL import Image, ImageSequence
+ import cv2
+
+
+ API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
+ HF_TOKEN = os.environ["HF_TOKEN"]
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"}
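+ # NOTE: HF_TOKEN is assumed to be set in the environment (e.g. as a Space secret);
+ # os.environ["HF_TOKEN"] raises a KeyError at startup if it is missing.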
+
+
+ def generate_transcripts(in_video):
+     print("********* Inside generate_transcripts() **********")
+     #convert video to audio
+     print(f" input video is : {in_video}")
+
+     #sample
+     video_path = Path("./ShiaLaBeouf.mp4")
+     audio_memory, _ = ffmpeg.input(in_video).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
+     #audio_memory, _ = ffmpeg.input(video_path).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
+
+     #getting transcripts using the wav2vec2 Hugging Face hosted accelerated inference
+     #sending the audio file in the request along with stride and chunk length information
+     model_response = query_api(audio_memory)
+
+     #model response has both the transcript and the character timestamps (chunks)
+     print(f"model_response is : {model_response}")
+     transcription = model_response["text"].lower()
+     chnk = model_response["chunks"]
+
+     #creating lists from chunks for easy downstream consumption
+     timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
+                   for chunk in chnk]
+
+     #getting words and word timestamps
+     words, words_timestamp = get_word_timestamps(timestamps)
+     print(f"Total words in the audio transcript: {len(words)}, transcript word list: {words}, type of words: {type(words)}")
+     print(f"Total word timestamps derived from character timestamps: {len(words_timestamp)}, word timestamps: {words_timestamp}")
+
+     return transcription, words, words_timestamp
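+ # The word list and word timestamps are returned through hidden gr.Textbox components,
+ # so they reach generate_gifs() below as strings and are parsed back with ast.literal_eval.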
+
+
+ def generate_gifs(in_video, gif_transcript, words, words_timestamp):
+     print("********* Inside generate_gifs() **********")
+
+     #creating a list from the input gif transcript
+     #gif = "don't let your dreams be dreams"
+     gif = gif_transcript
+     giflist = gif.split()
+
+     #getting gif word indexes from the generator
+     #converting the strings back to lists
+     words = ast.literal_eval(words)
+     words_timestamp = ast.literal_eval(words_timestamp)
+     print(f"words is :{words}")
+     print(f"type of words is :{type(words)}")
+     print(f"length of words is :{len(words)}")
+     print(f"giflist is :{giflist}")
+
+     giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
+     print(f"giflist_indxs is : {giflist_indxs}")
+     #getting start and end timestamps for the gif video
+     start_seconds, end_seconds = get_gif_timestamps(giflist_indxs, words_timestamp)
+     print(f"start_seconds, end_seconds are : ({start_seconds}, {end_seconds})")
+     #generating the clip
+     #gif_out, vid_out = gen_moviepy_gif(in_video, start_seconds, end_seconds)
+     slomo_vid = gen_moviepy_gif(in_video, start_seconds, end_seconds)
+
+     return slomo_vid
+
+
+ #calling the hosted model
+ def query_api(audio_bytes: bytes):
+     """
+     Query the Hugging Face Inference API for the automatic speech recognition task
+     """
+     print("********* Inside query_api() **********")
+     payload = json.dumps({
+         "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
+         "parameters": {
+             "return_timestamps": "char",
+             "chunk_length_s": 10,
+             "stride_length_s": [4, 2]
+         },
+         "options": {"use_gpu": False}
+     }).encode("utf-8")
+
+     response = requests.request(
+         "POST", API_URL, headers=headers, data=payload)
+     json_response = json.loads(response.content.decode("utf-8"))
+     print(f"json_response is :{json_response}")
+     return json_response
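+ # For reference, the response consumed above is expected to look like (values hypothetical):
+ # {"text": "DON'T LET YOUR DREAMS BE DREAMS",
+ #  "chunks": [{"text": "d", "timestamp": [0.12, 0.18]}, {"text": "o", "timestamp": [0.18, 0.23]}, ...]}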
+
+
+ #getting word timestamps from character timestamps
+ def get_word_timestamps(timestamps):
+     words, word = [], []
+     letter_timestamp, word_timestamp, words_timestamp = [], [], []
+     for idx, entry in enumerate(timestamps):
+         word.append(entry[0])
+         letter_timestamp.append(entry[1])
+         if entry[0] == ' ':
+             words.append(''.join(word))
+             word_timestamp.append(letter_timestamp[0])
+             word_timestamp.append(timestamps[idx-1][2])
+             words_timestamp.append(word_timestamp)
+             word, word_timestamp, letter_timestamp = [], [], []
+
+     #flush the last word if the transcript does not end with a space
+     if word:
+         words.append(''.join(word))
+         words_timestamp.append([letter_timestamp[0], timestamps[-1][2]])
+
+     words = [word.strip() for word in words]
+     return words, words_timestamp
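+ # Worked example (hypothetical timestamps): character entries
+ # [['h', 0.1, 0.2], ['i', 0.2, 0.3], [' ', 0.3, 0.3]] yield
+ # words == ['hi'] and words_timestamp == [[0.1, 0.3]].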
+
+
+ #getting indexes of the gif words in the main transcript
+ def get_gif_word_indexes(total_words_list, gif_words_list):
+     if not gif_words_list:
+         return
+     #caching length and first word, just an optimization
+     lengthgif_words_list = len(gif_words_list)
+     firstgif_words_list = gif_words_list[0]
+
+     print(f"total_words_list is :{total_words_list}")
+     print(f"length of total_words_list is :{len(total_words_list)}")
+     print(f"gif_words_list is :{gif_words_list}")
+     print(f"length of gif_words_list is :{len(gif_words_list)}")
+
+     for idx, item in enumerate(total_words_list):
+         if item == firstgif_words_list:
+             if total_words_list[idx:idx+lengthgif_words_list] == gif_words_list:
+                 print(f"value of tuple is : {tuple(range(idx, idx+lengthgif_words_list))}")
+                 yield tuple(range(idx, idx+lengthgif_words_list))
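+ # e.g. total_words_list == ['do', 'it', 'just', 'do', 'it'] and
+ # gif_words_list == ['do', 'it'] yields (0, 1) and then (3, 4);
+ # generate_gifs() above keeps only the first match.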
+
+
+ #getting start and end timestamps for the gif transcript
+ def get_gif_timestamps(giflist_indxs, words_timestamp):
+     print("******** Inside get_gif_timestamps() **********")
+     min_idx = min(giflist_indxs)
+     max_idx = max(giflist_indxs)
+     print(f"min_idx is :{min_idx}")
+     print(f"max_idx is :{max_idx}")
+
+     gif_words_timestamp = words_timestamp[min_idx : max_idx+1]
+     print(f"words_timestamp is :{words_timestamp}")
+     print(f"gif_words_timestamp is :{gif_words_timestamp}")
+
+     start_seconds, end_seconds = gif_words_timestamp[0][0], gif_words_timestamp[-1][-1]
+     print(f"start_seconds, end_seconds are :{start_seconds},{end_seconds}")
+
+     return start_seconds, end_seconds
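+ # e.g. giflist_indxs == [3, 4, 5] selects words_timestamp[3:6]; the clip runs from
+ # the start of the first selected word to the end of the last one.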
+
+
+ #extracting the clip and building and serving a .gif image
+ def gen_moviepy_gif(in_video, start_seconds, end_seconds):
+     print("******** inside gen_moviepy_gif() ***************")
+     #sample
+     video_path = "./ShiaLaBeouf.mp4"
+     video = mp.VideoFileClip(in_video)
+     #video = mp.VideoFileClip(video_path)
+
+     final_clip = video.subclip(start_seconds, end_seconds)
+
+     #slow motion: apply the speedx effect at half speed to the selected subclip
+     slomo_clip = final_clip.fx(mp.vfx.speedx, 0.5)
+     slomo_clip.write_videofile("slomo.mp4")
+
+     #writing to disk
+     final_clip.write_gif("gifimage.gif") #, program='ffmpeg', tempfiles=True, fps=15, fuzz=3)
+     final_clip.write_videofile("gifimage.mp4")
+     final_clip.close()
+     #reading into a variable
+     gif_img = mp.VideoFileClip("gifimage.gif")
+     #gif_vid = mp.VideoFileClip("gifimage.mp4")
+     #im = Image.open("gifimage.gif")
+     #vid_cap = cv2.VideoCapture('gifimage.mp4')
+     return "slomo.mp4" #"gifimage.gif", "gifimage.mp4" #im, gif_img, gif_vid, vid_cap, #"gifimage.mp4"
+
+
+ sample_video = ['./ShiaLaBeouf.mp4']
+ sample_vid = gr.Video(label='Video file') #for displaying the example
+ examples = gr.components.Dataset(components=[sample_vid], samples=[sample_video], type='values')
+
+
+ demo = gr.Blocks()
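+ # The Dataset component acts as a clickable example gallery here; the load_examples
+ # handler defined further below copies the selected sample path into the input video.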
+
+ with demo:
+     gr.Markdown("""# **Create Any GIF From Your Favorite Videos!** """)
+     gr.Markdown("""
+ ### Now you can get your own unlimited supply of cool GIFs and reactions from the videos you like most.
+
+ A Space by [Yuvraj Sharma](https://huggingface.co/ysharma). Some cool sample .gif images generated using this Space -
+
+ <table>
+ <tr>
+ <td>Sample GIF 1</td>
+ <td>Sample GIF 2</td>
+ <td>Sample GIF 3</td>
+ </tr>
+ <tr>
+ <td><img src='https://media.giphy.com/media/IP69ha9NNIXJFqR4BI/giphy.gif' width='40%'></td>
+ <td><img src='https://media.giphy.com/media/YAH1yXag018HutbnfX/giphy.gif' width='40%'></td>
+ <td><img src='https://media.giphy.com/media/jNx9j9ENo6hQ3GnR95/giphy.gif' width='40%'></td>
+ </tr>
+ </table>
+
+ **Motivation and background:** In this Gradio Space cum blog, I will take you through my efforts in reproducing the brilliant app [Edit Video By Editing Text](https://huggingface.co/spaces/radames/edit-video-by-editing-text) by [@radames](https://huggingface.co/radames). My value-adds are -
+ - A permanent supply of your own new GIFs
+ - A Space written in the form of a notebook, or a blog if I may, to help others understand how they too can build this kind of app.
+
+ **How To Use:** 1. Upload a video, or simply click on the Shia LaBeouf sample provided here.
+ 2. Then click the 'Generate transcripts' button; the first textbox will display the transcript extracted from the audio of your sample.
+ 3. Copy the text from the transcript, or type it manually, into the second textbox provided.
+ 4. A .gif image will be generated on the right-hand side of the animated Shia LaBeouf!
+
+ Hope you have fun using this 😀
+ """)
+
+     with gr.Row():
+         #for the incoming video
+         input_video = gr.Video(label="Upload a Video", visible=True)
+         #to generate and display transcriptions for the input video
+         text_transcript = gr.Textbox(label="Transcripts", lines=10, interactive=True)
+
+         #just to move data between functions, hence keeping visible=False
+         text_words = gr.Textbox(visible=False)
+         text_wordstimestamps = gr.Textbox(visible=False)
+
+         #to copy-paste the required gif transcript / or to populate by itself on pressing the button
+         text_gif_transcript = gr.Textbox(label="Transcripts", placeholder="Copy paste transcripts here to create GIF image", lines=3, interactive=True)
+
+         def load_gif_text(text):
+             print("****** inside load_gif_text() ******")
+             print("text for gif is : ", text)
+             return text
+
+         text_transcript.change(load_gif_text, text_transcript, text_gif_transcript)
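+         # whenever a new transcript is generated, the full text is copied into the
+         # gif-transcript box by default; the user can then trim it to the clip they want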
+
+         #out_gif = gr.Image(label="Generated GIF image")
+         out_slomo_vid = gr.Video(label="Generated GIF image")
+
+     with gr.Row():
+         button_transcript = gr.Button("Generate transcripts")
+         button_gifs = gr.Button("Create Gif")
+
+     with gr.Row():
+         #to render the video example on mouse hover/click
+         examples.render()
+         #to load the sample video into input_video upon clicking on it
+         def load_examples(video):
+             print("****** inside load_examples() ******")
+             print("in_video is : ", video[0])
+             return video[0]
+
+         examples.click(load_examples, examples, input_video)
+
+
+     button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps])
+     button_gifs.click(generate_gifs, [input_video, text_gif_transcript, text_words, text_wordstimestamps], out_slomo_vid)
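+     # Dataflow: 'Generate transcripts' fills the transcript box plus the two hidden
+     # textboxes (words and word timestamps); 'Create Gif' reads them all back as strings
+     # to locate and cut the requested clip.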
+
+
+ demo.launch(debug=True)