Spaces:
Runtime error
Runtime error
create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import ffmpeg
|
3 |
+
from pathlib import Path
|
4 |
+
import os
|
5 |
+
import ast
|
6 |
+
import json
|
7 |
+
import base64
|
8 |
+
import requests
|
9 |
+
import moviepy.editor as mp
|
10 |
+
from PIL import Image, ImageSequence
|
11 |
+
import cv2
|
12 |
+
|
13 |
+
|
14 |
+
# Hosted wav2vec2 ASR model on the HuggingFace accelerated inference API.
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
# NOTE(review): this raises KeyError at import time when the HF_TOKEN secret
# is unset -- a likely cause of the Space's "Runtime error"; confirm the
# secret is configured in the Space settings.
HF_TOKEN = os.environ["HF_TOKEN"]
# Bearer-token header sent with every inference request.
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
|
17 |
+
|
18 |
+
|
19 |
+
def generate_transcripts(in_video):
    """Extract the transcript and word-level timestamps from a video's audio.

    Converts the video's audio track to 16 kHz mono WAV in memory via ffmpeg,
    sends it to the hosted wav2vec2 inference API, and collapses the returned
    character-level chunks into per-word timestamps.

    Returns (transcription, words, words_timestamp).
    """
    print("********* Inside generate_transcripts() **********")
    print(f" input video is : {in_video}")

    # Pull the audio stream out of the video as raw WAV bytes (mono, 16 kHz),
    # keeping everything in memory -- no temp file needed.
    audio_memory, _ = (
        ffmpeg.input(in_video)
        .output('-', format="wav", ac=1, ar='16k')
        .overwrite_output()
        .global_args('-loglevel', 'quiet')
        .run(capture_stdout=True)
    )

    # ASR via the hosted wav2vec2 model; the response carries the full
    # transcript plus per-character timestamps ("chunks").
    model_response = query_api(audio_memory)
    print(f"model_response is : {model_response}")

    transcription = model_response["text"].lower()

    # Flatten the chunk dicts into [char, start, end] triples for downstream use.
    timestamps = [
        [chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
        for chunk in model_response["chunks"]
    ]

    # Collapse character timestamps into per-word timestamps.
    words, words_timestamp = get_word_timestamps(timestamps)
    print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}, type of words is :{type(words)} ")
    print(f"Total Word timestamps derived fromcharacter timestamp are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")

    return transcription, words, words_timestamp
|
48 |
+
|
49 |
+
|
50 |
+
def generate_gifs(in_video, gif_transcript, words, words_timestamp):
    """Cut the clip of *in_video* that matches *gif_transcript* and return it.

    *words* and *words_timestamp* arrive as stringified Python lists (they are
    shuttled between callbacks through hidden Gradio textboxes), so they are
    parsed back into lists with ast.literal_eval before use.
    """
    print("********* Inside generate_gifs() **********")

    # Tokenize the requested phrase.
    giflist = gif_transcript.split()

    # The hidden textboxes hand us strings; turn them back into lists.
    words = ast.literal_eval(words)
    words_timestamp = ast.literal_eval(words_timestamp)
    print(f"words is :{words}")
    print(f"type of words is :{type(words)}")
    print(f"length of words is :{len(words)}")
    print(f"giflist is :{giflist}")

    # First occurrence of the phrase inside the transcript word list.
    giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
    print(f"giflist_indxs is : {giflist_indxs}")

    # Map the matched word span onto audio start/end seconds.
    start_seconds, end_seconds = get_gif_timestamps(giflist_indxs, words_timestamp)
    print(f"start_seconds, end_seconds are : ({start_seconds}, {end_seconds})")

    # Render the slow-motion clip for that window.
    slomo_vid = gen_moviepy_gif(in_video, start_seconds, end_seconds)

    return slomo_vid
|
78 |
+
|
79 |
+
|
80 |
+
#calling the hosted model
|
81 |
+
#calling the hosted model
def query_api(audio_bytes: bytes):
    """Send audio bytes to the HuggingFace Inference API for ASR.

    The audio is base64-encoded into a JSON payload that also requests
    character-level timestamps; returns the decoded JSON response.
    """
    print("********* Inside query_api() **********")
    request_body = {
        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
        "parameters": {
            # Character timestamps are needed downstream to rebuild
            # word-level timing.
            "return_timestamps": "char",
            "chunk_length_s": 10,
            "stride_length_s": [4, 2],
        },
        "options": {"use_gpu": False},
    }
    payload = json.dumps(request_body).encode("utf-8")

    response = requests.request("POST", API_URL, headers=headers, data=payload)
    json_reponse = json.loads(response.content.decode("utf-8"))
    print(f"json_reponse is :{json_reponse}")
    return json_reponse
|
101 |
+
|
102 |
+
|
103 |
+
#getting word timestamps from character timestamps
|
104 |
+
#getting word timestamps from character timestamps
def get_word_timestamps(timestamps):
    """Collapse character-level timestamps into word-level ones.

    Parameters
    ----------
    timestamps : list of [char, start, end] triples, one per character, in
        transcript order; words are separated by ' ' entries.

    Returns
    -------
    (words, words_timestamp) : words is the list of words (whitespace
        stripped); words_timestamp is a parallel list of [start, end] pairs.
    """
    words, word = [], []
    letter_timestamp, word_timestamp, words_timestamp = [], [], []
    for idx, entry in enumerate(timestamps):
        word.append(entry[0])
        letter_timestamp.append(entry[1])
        if entry[0] == ' ':
            words.append(''.join(word))
            # A word spans from its first character's start to the end of the
            # character just before the separating space.
            word_timestamp.append(letter_timestamp[0])
            word_timestamp.append(timestamps[idx-1][2])
            words_timestamp.append(word_timestamp)
            word, word_timestamp, letter_timestamp = [], [], []

    # Bug fix: flush the trailing word. Transcripts rarely end with a space,
    # so the original loop silently dropped the final word and its timestamp.
    if word:
        words.append(''.join(word))
        words_timestamp.append([letter_timestamp[0], timestamps[-1][2]])

    words = [word.strip() for word in words]
    return words, words_timestamp
|
119 |
+
|
120 |
+
|
121 |
+
#getting index of gif words in main transcript
|
122 |
+
#getting index of gif words in main transcript
def get_gif_word_indexes(total_words_list, gif_words_list):
    """Yield the index tuple of every occurrence of the phrase.

    For each position where *gif_words_list* appears as a contiguous
    sub-sequence of *total_words_list*, yields the tuple of matching indexes.
    Yields nothing for an empty phrase.
    """
    if not gif_words_list:
        return

    phrase_len = len(gif_words_list)
    first_word = gif_words_list[0]

    print(f"total_words_list is :{total_words_list}")
    print(f"length of total_words_list is :{len(total_words_list)}")
    print(f"gif_words_list is :{gif_words_list}")
    print(f"length of gif_words_list is :{len(gif_words_list)}")

    for start, current in enumerate(total_words_list):
        # Cheap first-word check before comparing the whole slice.
        if current == first_word and total_words_list[start:start + phrase_len] == gif_words_list:
            print(f"value of tuple is : {tuple(range(start, start + phrase_len))}")
            yield tuple(range(start, start + phrase_len))
|
141 |
+
|
142 |
+
|
143 |
+
#getting start and end timestamps for gif transcript
|
144 |
+
#getting start and end timestamps for gif transcript
def get_gif_timestamps(giflist_indxs, words_timestamp):
    """Map matched word indexes onto an audio time window.

    Given the indexes of the matched words and the per-word [start, end]
    timestamps, returns (start_seconds, end_seconds) spanning the whole match.
    """
    print(f"******** Inside get_gif_timestamps() **********")
    min_idx, max_idx = min(giflist_indxs), max(giflist_indxs)
    print(f"min_idx is :{min_idx}")
    print(f"max_idx is :{max_idx}")

    # Timestamps for just the matched words.
    gif_words_timestamp = words_timestamp[min_idx:max_idx + 1]
    print(f"words_timestamp is :{words_timestamp}")
    print(f"gif_words_timestamp is :{gif_words_timestamp}")

    # Window runs from the first matched word's start to the last one's end.
    start_seconds = gif_words_timestamp[0][0]
    end_seconds = gif_words_timestamp[-1][-1]
    print(f"start_seconds, end_seconds are :{start_seconds},{end_seconds}")

    return start_seconds, end_seconds
|
159 |
+
|
160 |
+
|
161 |
+
#extracting the video and building and serving a .gif image
|
162 |
+
#extracting the video and building and serving a .gif image
def gen_moviepy_gif(in_video, start_seconds, end_seconds):
    """Cut [start_seconds, end_seconds] out of *in_video* and render outputs.

    Writes three artifacts to the working directory:
      - slomo.mp4    : the clip slowed to half speed (what the UI displays)
      - gifimage.gif : the clip as an animated GIF
      - gifimage.mp4 : the clip as a normal-speed video

    Returns the path of the slow-motion video.
    """
    print("******** inside moviepy_gif () ***************")
    video = mp.VideoFileClip(in_video)

    # The requested window of the source video.
    final_clip = video.subclip(start_seconds, end_seconds)

    # Half-speed version of the clip. Bug fix: the original called
    # video.subclip(mp.vfx.speedx, 0.5), passing the speedx function object
    # as a start time; the effect must be applied with fx().
    slomo_clip = final_clip.fx(mp.vfx.speedx, 0.5)
    slomo_clip.write_videofile("slomo.mp4")

    final_clip.write_gif("gifimage.gif")
    final_clip.write_videofile("gifimage.mp4")

    # Release the clip readers (ffmpeg subprocesses) explicitly.
    final_clip.close()
    video.close()

    return "slomo.mp4"
|
185 |
+
|
186 |
+
|
187 |
+
# ---------------------------------------------------------------------------
# Gradio UI wiring: Blocks layout, example loading, and button callbacks.
# (Indentation reconstructed from semantics; the scraped source carried none.)
# ---------------------------------------------------------------------------

# Example assets shown under the inputs: a bundled sample video.
sample_video = ['./ShiaLaBeouf.mp4']
sample_vid = gr.Video(label='Video file') #for displaying the example
examples = gr.components.Dataset(components=[sample_vid], samples=[sample_video], type='values')


demo = gr.Blocks()

with demo:
    gr.Markdown("""# **Create Any GIF From Your Favorite Videos!** """)
    gr.Markdown("""
### Now you can get your own unlimited supply of cool GIFs and reactions from the videos you most like..

A Space by [Yuvraj Sharma](https://huggingface.co/ysharma). Some cool sample .gif images generated using this Space -

<table>
<tr>
<td>Sample GIF 1</td>
<td>Sample GIF 2</td>
<td>Sample GIF 3</td>
</tr>
<tr>
<td><img src='https://media.giphy.com/media/IP69ha9NNIXJFqR4BI/giphy.gif' width='40%'></td>
<td><img src='https://media.giphy.com/media/YAH1yXag018HutbnfX/giphy.gif' width='40%'></td>
<td><img src='https://media.giphy.com/media/jNx9j9ENo6hQ3GnR95/giphy.gif' width='40%'></td>
</tr>
</table>

**Motivation and background:** In this Gradio-Space cum Blog, I will be taking you through my efforts in reproducing the brilliant app [Edit Video By Editing Text](https://huggingface.co/spaces/radames/edit-video-by-editing-text) by [@radames](https://huggingface.co/radames). My valule-adds are -
- A permanent supply for your own new GIFs
- This Space written in the form of a Notebook or a Blog if I may, to help someone understand how they can too build this kind of an app.

**How To Use:** 1. Upload a video or simply click on the Shia LaBeouf's sample provided here.
2. Then click on 'Generate transcripts' button and first textbox will display the extract Transcript from the audio associated with your sample.
3. Clip the text from transcript or type manually in the second Textbox provided.
4. A .Gif image will get generated on the right hand side of animated Shia Labeouf!

Hopee you have fun using this π
""")

    with gr.Row():
        #for incoming video
        input_video = gr.Video(label="Upload a Video", visible=True)
        #to generate and display transcriptions for input video
        text_transcript = gr.Textbox(label="Transcripts", lines = 10, interactive = True )

        #Just to move dgata between function hence keeping visible false
        text_words = gr.Textbox(visible=False)
        text_wordstimestamps = gr.Textbox(visible=False)

        #to copy paste required gif transcript / or to populate by itslef on pressing the button
        text_gif_transcript = gr.Textbox(label="Transcripts", placeholder="Copy paste transcripts here to create GIF image" , lines = 3, interactive = True )

        # Mirrors the full transcript into the GIF-text box whenever the
        # transcript textbox changes (user then trims it down to a phrase).
        def load_gif_text(text):
            print("****** inside load_gif_text() ******")
            print("text for gif is : ", text)
            return text

        text_transcript.change(load_gif_text, text_transcript, text_gif_transcript )

        #out_gif = gr.Image(label="Generated GIF image")
        # NOTE(review): this is a Video component despite the "GIF image"
        # label -- it shows the slow-motion .mp4 returned by generate_gifs.
        out_slomo_vid = gr.Video(label="Generated GIF image")

    with gr.Row():
        button_transcript = gr.Button("Generate transcripts")
        button_gifs = gr.Button("Create Gif")

    with gr.Row():
        #to render video example on mouse hover/click
        examples.render()
        #to load sample video into input_video upon clicking on it
        def load_examples(video):
            print("****** inside load_example() ******")
            print("in_video is : ", video[0])
            return video[0]

        examples.click(load_examples, examples, input_video)

    # Button wiring: transcripts flow into three textboxes (two hidden), and
    # GIF creation reads them back out alongside the chosen phrase.
    button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps ])
    button_gifs.click(generate_gifs, [input_video, text_gif_transcript, text_words, text_wordstimestamps], out_slomo_vid )


demo.launch(debug=True)
|