Spaces:
Build error
Build error
File size: 15,808 Bytes
527d5e6 27f75e4 b0a9549 f79bf90 edaaf4f c3d46df 7dbd9b3 ad3ea9b e37d735 ad3ea9b b0a9549 527d5e6 8af272d 527d5e6 f09eeaa b0a9549 f09eeaa 28d279e f09eeaa 605e778 f09eeaa 28d279e 6010350 92daced 28d279e b69cb8c 92daced 8da8563 f09eeaa 28c97fa f09eeaa 4a2e5d1 f09eeaa 6f851e0 8da8563 f09eeaa 527d5e6 b0a9549 527d5e6 b0a9549 527d5e6 f09eeaa 527d5e6 8f567e8 527d5e6 ffcd8ed 527d5e6 8f567e8 527d5e6 9ccc595 527d5e6 ef53b73 28c97fa 527d5e6 28c97fa 527d5e6 d10d751 28c97fa 527d5e6 28c97fa 527d5e6 7dbd9b3 1a39558 2a93029 8da8563 527d5e6 26a1200 ef53b73 cecb879 26a1200 ef53b73 8da8563 6f851e0 8da8563 6f851e0 203beb3 f09eeaa 6f851e0 3b2b7c8 14aeeff 3b2b7c8 14aeeff 3b2b7c8 6f851e0 f09eeaa ef53b73 f09eeaa ef53b73 cb3d639 c846c6f 6f851e0 cb3d639 c846c6f ef53b73 f09eeaa ef53b73 f09eeaa 4940b1c f09eeaa e905a92 fcd9555 e905a92 fcd9555 e905a92 fcd9555 e905a92 f09eeaa e905a92 fcd9555 e905a92 fcd9555 e905a92 209d5da f09eeaa 849dabd f09eeaa e3a5cd7 4f08108 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 |
import gradio as gr
import ffmpeg
from pathlib import Path
import os
import ast
import json
import base64
import requests
import moviepy.editor as mp
from PIL import Image, ImageSequence
import cv2
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
# The access token is read from the HF_TOKEN environment variable (e.g. a Space
# secret) so that no credential is stored in the source file. When the variable
# is not set, no Authorization header is sent at all.
# NOTE(review): a token that was previously hard-coded here must be treated as
# leaked and revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")
headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
def generate_transcripts(in_video):
    """Transcribe the bundled sample video and derive word-level timestamps.

    The audio track is extracted with ffmpeg as mono 16 kHz WAV, sent to the
    hosted speech-recognition model through ``query_api`` and the returned
    character chunks are folded into per-word timestamps.

    Args:
        in_video: Video selected in the UI. It is only logged here; the audio
            is always read from ``./ShiaLaBeouf.mp4``.

    Returns:
        A ``(transcription, words, words_timestamp)`` tuple: the lower-cased
        transcript text, the list of words, and the list of ``[start, end]``
        pairs aligned with ``words``.

    Raises:
        RuntimeError: If the API response does not contain both ``"text"``
            and ``"chunks"`` (for example an ``{"error": ...}`` payload).
    """
    print("********* Inside generate_transcripts() **********")
    # convert video to audio
    print(f" input video is : {in_video}")
    video_path = Path("./ShiaLaBeouf.mp4")
    audio_memory, _ = (
        ffmpeg.input(video_path)
        .output('-', format="wav", ac=1, ar='16k')
        .overwrite_output()
        .global_args('-loglevel', 'quiet')
        .run(capture_stdout=True)
    )

    # Getting transcripts using the hosted wav2vec2 inference endpoint;
    # stride and chunk length are set inside query_api.
    model_response = query_api(audio_memory)
    # model response has both - transcript as well as character timestamps (chunks)
    print(f"model_response is : {model_response}")

    # Fail with a readable message instead of a KeyError when the service
    # answers with something other than a transcription.
    if (
        not isinstance(model_response, dict)
        or "text" not in model_response
        or "chunks" not in model_response
    ):
        raise RuntimeError(
            f"Speech recognition request did not return a transcript: {model_response}"
        )

    transcription = model_response["text"].lower()
    chnk = model_response["chunks"]

    # creating [char, start, end] lists from chunks to consume downstream easily
    timestamps = [
        [chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
        for chunk in chnk
    ]

    # getting words and word timestamps
    words, words_timestamp = get_word_timestamps(timestamps)
    print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}, type of words is :{type(words)} ")
    print(f"Total Word timestamps derived fromcharacter timestamp are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")

    return transcription, words, words_timestamp
def generate_gifs(gif_transcript, words, words_timestamp):
    """Create a GIF covering the part of the video where a phrase is spoken.

    Args:
        gif_transcript: Phrase to look up in the transcript (whitespace
            separated words). It is lower-cased before matching because the
            transcript words are lower-cased when they are generated.
        words: Transcript word list, either as a list or as its string
            representation (as delivered by the hidden textbox).
        words_timestamp: ``[start, end]`` pairs aligned with ``words``, as a
            list or as its string representation.

    Returns:
        Path of the generated GIF file, as returned by ``gen_moviepy_gif``.

    Raises:
        ValueError: If no transcript data is available, or the phrase is
            empty or does not occur in the transcript.
    """
    print("********* Inside generate_gifs() **********")

    if not words or not words_timestamp:
        raise ValueError(
            "No transcript data available - click 'Generate transcripts' first."
        )

    # creating list from input gif transcript
    giflist = (gif_transcript or "").lower().split()

    # Converting string to list; literal_eval only accepts Python literals.
    if isinstance(words, str):
        words = ast.literal_eval(words)
    if isinstance(words_timestamp, str):
        words_timestamp = ast.literal_eval(words_timestamp)
    print(f"words is :{words}")
    print(f"type of words is :{type(words)}")
    print(f"length of words is :{len(words)}")
    print(f"giflist is :{giflist}")

    # getting gif indexes from the generator; only the first occurrence is used
    first_match = next(get_gif_word_indexes(words, giflist), None)
    if first_match is None:
        raise ValueError(
            f"The phrase {gif_transcript!r} was not found in the transcript."
        )
    giflist_indxs = list(first_match)
    print(f"giflist_indxs is : {giflist_indxs}")

    # getting start and end timestamps for a gif video
    start_seconds, end_seconds = get_gif_timestamps(giflist_indxs, words_timestamp)
    print(f"start_seconds, end_seconds are : ({start_seconds}, {end_seconds})")

    # generated .gif image (the .mp4 path returned alongside is not used here)
    gif_out, _ = gen_moviepy_gif(start_seconds, end_seconds)

    return gif_out
#calling the hosted model
def query_api(audio_bytes: bytes):
    """
    Query the Hugging Face Inference API for the Automatic Speech Recognition task.

    The audio is sent base64-encoded in a JSON payload that asks for
    character-level timestamps, 10 s chunks and a [4, 2] stride.

    Args:
        audio_bytes: Raw audio file content to transcribe.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: If the service cannot be reached or does
            not answer within the timeout below.
    """
    print("********* Inside query_api() **********")
    payload = json.dumps({
        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
        "parameters": {
            "return_timestamps": "char",
            "chunk_length_s": 10,
            "stride_length_s": [4, 2]
        },
        "options": {"use_gpu": False}
    }).encode("utf-8")

    # Without a timeout a stalled connection would block the caller forever.
    # (connect timeout, read timeout) in seconds; the read timeout is generous
    # because the response only arrives after inference has finished.
    response = requests.request(
        "POST", API_URL, headers=headers, data=payload, timeout=(10, 300))
    json_reponse = json.loads(response.content.decode("utf-8"))
    print(f"json_reponse is :{json_reponse}")
    return json_reponse
#getting word timestamps from character timestamps
def get_word_timestamps(timestamps):
    """Collapse character-level timestamps into word-level timestamps.

    Args:
        timestamps: Sequence of ``[char, start, end]`` entries in transcript
            order. An entry whose char is ``' '`` marks a word boundary.

    Returns:
        ``(words, words_timestamp)``: two parallel lists where
        ``words_timestamp[i]`` is ``[start of first char, end of last char]``
        of ``words[i]``. Runs of spaces never produce empty words, and a
        final word that is not followed by a space is included.
    """
    words, words_timestamp = [], []
    letters = []  # [char, start, end] entries of the word currently being built

    def flush():
        # Emit the pending word (if any) and start a new one.
        if letters:
            words.append(''.join(entry[0] for entry in letters).strip())
            words_timestamp.append([letters[0][1], letters[-1][2]])
            letters.clear()

    for entry in timestamps:
        if entry[0] == ' ':
            flush()
        else:
            letters.append(entry)
    # The transcript does not necessarily end with a space.
    flush()

    return words, words_timestamp
#getting index of gif words in main transcript
def get_gif_word_indexes(total_words_list, gif_words_list):
    """Yield the index tuples of every occurrence of a phrase in a word list.

    Args:
        total_words_list: All transcript words, in order.
        gif_words_list: The phrase to look for, as a list of words.

    Yields:
        For each position where ``gif_words_list`` occurs as a contiguous run
        in ``total_words_list``, a tuple with the indices of that run.
        Nothing is yielded when ``gif_words_list`` is empty.
    """
    if not gif_words_list:
        return

    phrase_len = len(gif_words_list)
    # Cheap pre-check: only compare the full slice where the first word matches.
    phrase_head = gif_words_list[0]

    print(f"total_words_list is :{total_words_list}")
    print(f"length of total_words_list is :{len(total_words_list)}")
    print(f"gif_words_list is :{gif_words_list}")
    print(f"length of gif_words_list is :{len(gif_words_list)}")

    for start, candidate in enumerate(total_words_list):
        if candidate != phrase_head:
            continue
        stop = start + phrase_len
        if total_words_list[start:stop] == gif_words_list:
            hit = tuple(range(start, stop))
            print(f"value of tuple is : {hit}")
            yield hit
#getting start and end timestamps for gif transcript
def get_gif_timestamps(giflist_indxs, words_timestamp):
    """Return the time span covered by a group of word indices.

    Args:
        giflist_indxs: Indices into ``words_timestamp`` of the selected words.
        words_timestamp: One timestamp sequence per transcript word.

    Returns:
        ``(start_seconds, end_seconds)``: the first element of the entry at
        the smallest index and the last element of the entry at the largest
        index.
    """
    print("******** Inside get_gif_timestamps() **********")
    lowest, highest = min(giflist_indxs), max(giflist_indxs)
    print(f"min_idx is :{lowest}")
    print(f"max_idx is :{highest}")

    selected = words_timestamp[lowest:highest + 1]
    print(f"words_timestamp is :{words_timestamp}")
    print(f"gif_words_timestamp is :{selected}")

    first_entry, last_entry = selected[0], selected[-1]
    start_seconds = first_entry[0]
    end_seconds = last_entry[-1]
    print(f"start_seconds, end_seconds are :{start_seconds},{end_seconds}")
    return start_seconds, end_seconds
#extracting the video and building and serving a .gif image
def gen_moviepy_gif(start_seconds, end_seconds):
    """Cut a sub-clip out of the sample video and save it as GIF and MP4.

    Args:
        start_seconds: Start of the sub-clip within ``./ShiaLaBeouf.mp4``.
        end_seconds: End of the sub-clip.

    Returns:
        The tuple ``("gifimage.gif", "gifimage.mp4")`` with the paths of the
        two files written to the current working directory.
    """
    print("******** inside moviepy_gif () ***************")
    video_path = "./ShiaLaBeouf.mp4"
    video = mp.VideoFileClip(video_path)
    try:
        final_clip = video.subclip(start_seconds, end_seconds)
        try:
            # both outputs are written to disk, next to the app
            final_clip.write_gif("gifimage.gif") #, program='ffmpeg', tempfiles=True, fps=15, fuzz=3)
            final_clip.write_videofile("gifimage.mp4")
        finally:
            final_clip.close()
    finally:
        # Release the source clip as well, even when writing fails.
        video.close()

    return "gifimage.gif", "gifimage.mp4"
# Bundled sample clip, offered to the user as a clickable example.
sample_video = ['./ShiaLaBeouf.mp4']
# Video component used only for displaying the example.
sample_vid = gr.Video(label='Video file')
examples = gr.components.Dataset(
    components=[sample_vid],
    samples=[sample_video],
    type='values',
)
demo = gr.Blocks()
with demo:
gr.Markdown("""# **Create Any GIF From Your Favorite Videos!** """)
gr.Markdown("""
### Now you can get your own unlimited supply of cool GIFs and reactions from the videos you like most.
**Motivation and background:** In this Gradio-Space Blog I will be taking you through my efforts in reproducing the brilliant app [Edit Video By Editing Text](https://huggingface.co/spaces/radames/edit-video-by-editing-text) by [@radames](https://huggingface.co/radames). My value-adds are -
- A permanent supply for your own new GIFs
- This Space is written in the form of a Notebook or a Blog, if I may, to help someone understand how they too can build this kind of app.
**How To Use:** 1. Upload a video or simply click on the Shia LaBeouf sample provided here.
2. Then click on the 'Generate transcripts' button and the first textbox will display the transcript extracted from the audio associated with your sample.
3. Clip the text from the transcript or type it manually in the second Textbox provided.
4. Click on the 'Create Gif' button and a .GIF image will get generated on the right-hand side of the animated Shia LaBeouf!
Hope you have fun using this 🙂
""")
# Widgets of the app: source video, transcript boxes, GIF preview, action buttons and the clickable sample.
with gr.Row():
#for incoming video
input_video = gr.Video(label="Upload a Video", visible=True)
#to generate and display transcriptions for input video
text_transcript = gr.Textbox(label="Transcripts", lines = 10, interactive = True )
#Hidden textboxes used only to move data between callbacks: filled from generate_transcripts, parsed back with ast.literal_eval in generate_gifs
text_words = gr.Textbox(visible=False)
text_wordstimestamps = gr.Textbox(visible=False)
#to copy-paste the required gif transcript / or to be populated by itself whenever the transcript box changes
text_gif_transcript = gr.Textbox(label="Transcripts", placeholder="Copy paste transcripts here to create GIF image" , lines = 3, interactive = True )
#returns the received text unchanged (after logging it)
def load_gif_text(text):
print("****** inside load_gif_text() ******")
print("text for gif is : ", text)
return text
#mirror the transcript into the gif-transcript box on every change
text_transcript.change(load_gif_text, text_transcript, text_gif_transcript )
out_gif = gr.Image(label="Generated GIF image")
with gr.Row():
button_transcript = gr.Button("Generate transcripts")
button_gifs = gr.Button("Create Gif")
with gr.Row():
#to render video example on mouse hover/click
examples.render()
#to load sample video into input_video upon clicking on it; returns the first element of the clicked sample
def load_examples(video):
print("****** inside load_example() ******")
print("in_video is : ", video[0])
return video[0]
examples.click(load_examples, examples, input_video)
with gr.Row():
gr.Markdown(""" I will start with a short note on my understanding of Radames's app and tools used in it -
- His is a supercool and handy proof of concept of a simple video editor where you can edit a video by playing with its audio transcriptions (ASR pipeline output).
- Both of our apps use Huggingface's **[Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)** built over the **Wav2Vec2** model, which internally uses CTC to improve predictions. The pipeline allows you to predict text transcriptions along with the timestamps for every character and pause in the audio.
- His app uses FFmpeg library to a good extent to clip and merge videos. FFmpeg is an open-source library for video handling consisting of a suite of functions for handling video, audio, and other multimedia files. My app uses FFmpeg as well as Moviepy to do the bulk of video+audio processing.
Let me now briefly take you through the code and process involved in building this app *step by step* 🙂 lol -
- Firstly, I have used ffmpeg to extract audio from video (this code line is directly from Radames's above app) -
```
audio_memory, _ = ffmpeg.input(video_path).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
```
- Then I am calling the ASR model as a service, using the Accelerated Inference API. Below is the code snippet for doing so -
```
def query(in_audio):
payload = json.dumps({ "inputs": base64.b64encode(in_audio).decode("utf-8"),
"parameters": {
"return_timestamps": "char",
"chunk_length_s": 10,
"stride_length_s": [4, 2]
},
"options": {"use_gpu": False}
}).encode("utf-8")
response = requests.request("POST", API_URL, data=payload)
json_response = json.loads(response.content.decode("utf-8"))
return json_response
```
- The transcript thus generated might have some words which are not correctly interpreted, for example, *tomorrow* is transcribed as 'to morrow', *hard at it* is transcribed as 'hot ati' and so on. However, this won't hinder the use-case I am demoing here, so let's move on.
> do it just do it don't let your dreams be dreams yesterday you said to morrow so just do it make you dreams can't yro just do it some people dream of success while you're going to wake up and work hot ati nothing is impossible you should get to the point where any one else would quit and you're luck in a stop there no what are you waiting for do et jot do it just you can just do it if you're tired is starting over stop giving up
- The other output generated by this ASR pipeline is a list of character timestamps dictionaries, look at the below sample to get an idea -
```
{'text': 'D', 'timestamp': [2.36, 2.38]},
{'text': 'O', 'timestamp': [2.52, 2.56]},
{'text': ' ', 'timestamp': [2.68, 2.72]},
{'text': 'I', 'timestamp': [2.84, 2.86]},
{'text': 'T', 'timestamp': [2.88, 2.92]},
{'text': ' ', 'timestamp': [2.94, 2.98]},
{'text': 'J', 'timestamp': [4.48, 4.52]},
```
- Next, using these character timestamps I have extracted word timestamps (by taking the start_timestamp of the first letter and the end_timestamp of the last letter in any given word).
- Further, when a *sub-transcript* is provided for producing the GIF, I calculate the start and end timestamp for the whole group of words.
- I have then used the *moviepy* library to extract / concat videos into smaller clips and also to save the final processed video file as a .GIF image.
```
import moviepy.editor as mp
video = mp.VideoFileClip(video_path)
final_clip = video.subclip(start_seconds, end_seconds)
#writing to RAM
final_clip.write_gif("gifimage.gif") #, program='ffmpeg', tempfiles=True, fps=15, fuzz=3)
final_clip.write_videofile("gifimage.mp4")
final_clip.close()
```
While working on apps for [Gradio Blocks Party](https://huggingface.co/Gradio-Blocks) I have gained a tremendous amount of knowledge about Gradio and Huggingface APIs and infrastructure. I was also able to polish my understanding and learn new things on some of the key and most interesting ML aspects like Question Answering, Sentence Transformers, Summarization, Image Generation, LLMs, Prompt Engineering, and now ASR and Video processing.
I absolutely love Spaces, I believe Spaces is much more than a platform to showcase your ML demos. I suppose it can act like an ML Product Sandbox with the whole of Huggingface's might and infra behind it. I believe Spaces can become sort of a playground for future ML products and ideas. All of this is extremely exciting.
Thanks for reading so far, I will see you at my next submission. Keep learning and sharing.
My last two Gradio Blocks Party apps can be found here -
- [Gradio-Blocks/GPTJ6B_Poetry_LatentDiff_Illustration](https://huggingface.co/spaces/Gradio-Blocks/GPTJ6B_Poetry_LatentDiff_Illustration), and
- [Gradio-Blocks/Ask_Questions_To_YouTube_Videos](https://huggingface.co/spaces/Gradio-Blocks/Ask_Questions_To_YouTube_Videos)
""")
# 'Generate transcripts' button: run generate_transcripts on the input video and fill the transcript box plus the two hidden boxes
button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps ])
# 'Create Gif' button: build the GIF from the chosen sub-transcript and the hidden word / timestamp data and show it
button_gifs.click(generate_gifs, [text_gif_transcript, text_words, text_wordstimestamps], out_gif )
# start the app (the stray trailing '|' after this call was removed - it is not valid Python)
demo.launch(debug=True)