import gradio as gr
import ffmpeg
from pathlib import Path
import os
import ast
import json
import base64
import requests
import moviepy.editor as mp
from PIL import Image, ImageSequence
import cv2
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
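# HF_TOKEN must be set in the environment (e.g. as a Space secret) before the app
# starts; a token hardcoded in the app code would be public along with the repo.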
def generate_transcripts(in_video):
    print("********* Inside generate_transcripts() **********")
    print(f"input video is : {in_video}")
    # NOTE: work in progress - the app currently transcribes the bundled sample video
    # rather than the uploaded one (gen_moviepy_gif() below clips the same file)
    video_path = Path("./ShiaLaBeouf.mp4")
    # extract the audio track as in-memory mono 16 kHz wav bytes
    audio_memory, _ = ffmpeg.input(video_path).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
    # get transcripts from wav2vec2 via the Huggingface hosted accelerated inference API,
    # sending the audio bytes along with stride and chunk-length information
    model_response = query_api(audio_memory)
    # the model response carries both the transcript and per-character timestamps ("chunks")
    print(f"model_response is : {model_response}")
    transcription = model_response["text"].lower()
    chnk = model_response["chunks"]
    # flatten the chunks into [character, start, end] lists for easier downstream use
    timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
                  for chunk in chnk]
    # derive words and word-level timestamps from the character-level ones
    words, words_timestamp = get_word_timestamps(timestamps)
    print(f"Total words in the audio transcript: {len(words)}, transcript word list: {words}")
    print(f"Total word timestamps derived from character timestamps: {len(words_timestamp)}, word timestamps: {words_timestamp}")
    return transcription, words, words_timestamp
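# For reference, the ffmpeg-python pipeline above corresponds (roughly) to running:
#   ffmpeg -y -loglevel quiet -i ShiaLaBeouf.mp4 -f wav -ac 1 -ar 16k -
# i.e. mono 16 kHz wav bytes on stdout, the sample rate wav2vec2-base-960h expects.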
def generate_gifs(gif_transcript, words, words_timestamp):
    print("********* Inside generate_gifs() **********")
    # split the phrase the user copy-pasted from the transcript into a word list
    gif = gif_transcript
    giflist = gif.split()
    # the hidden Textboxes pass the lists through as strings, so parse them back into Python lists
    words = ast.literal_eval(words)
    words_timestamp = ast.literal_eval(words_timestamp)
    print(f"words is :{words}")
    print(f"giflist is :{giflist}")
    # get the indexes of the phrase inside the full transcript; keep the first match
    giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
    print(f"giflist_indxs is : {giflist_indxs}")
    # get start and end timestamps for the gif video
    start_seconds, end_seconds = get_gif_timestamps(giflist_indxs, words_timestamp)
    print(f"start_seconds, end_seconds are : ({start_seconds}, {end_seconds})")
    # generate the .gif image and its mp4 companion
    im, gif_img, gif_vid, vid_cap = gen_moviepy_gif(start_seconds, end_seconds)
    im.save('./gifimage1.gif', save_all=True)
    # the gr.Video output component expects a filepath, so return the mp4 written above
    return "./gifimage.mp4"
#calling the hosted model
def query_api(audio_bytes: bytes):
    """
    Query the Huggingface Inference API for the Automatic Speech Recognition task
    """
    print("********* Inside query_api() **********")
    # audio bytes are base64-encoded; ask for character-level timestamps and
    # chunked decoding (10 s chunks with a 4 s / 2 s stride)
    payload = json.dumps({
        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
        "parameters": {
            "return_timestamps": "char",
            "chunk_length_s": 10,
            "stride_length_s": [4, 2]
        },
        "options": {"use_gpu": False}
    }).encode("utf-8")
    response = requests.request(
        "POST", API_URL, headers=headers, data=payload)
    json_response = json.loads(response.content.decode("utf-8"))
    print(f"json_response is :{json_response}")
    return json_response
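# The response consumed downstream has (roughly) this shape:
#   {"text": "DON'T LET YOUR DREAMS BE DREAMS ...",
#    "chunks": [{"text": "d", "timestamp": [0.1, 0.14]}, ...]}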
#getting word timestamps from character timestamps
def get_word_timestamps(timestamps):
    words, word = [], []
    letter_timestamp, word_timestamp, words_timestamp = [], [], []
    for idx, entry in enumerate(timestamps):
        word.append(entry[0])
        letter_timestamp.append(entry[1])
        if entry[0] == ' ':
            # a space closes the current word: its timestamp spans from the start of
            # its first character to the end of the character before the space
            words.append(''.join(word))
            word_timestamp.append(letter_timestamp[0])
            word_timestamp.append(timestamps[idx-1][2])
            words_timestamp.append(word_timestamp)
            word, word_timestamp, letter_timestamp = [], [], []
    # flush the final word, which is not followed by a space
    if word:
        words.append(''.join(word))
        words_timestamp.append([letter_timestamp[0], timestamps[-1][2]])
    words = [word.strip() for word in words]
    return words, words_timestamp
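# Worked example (hypothetical values): character timestamps
#   [['d', 0.1, 0.2], ['o', 0.2, 0.3], [' ', 0.3, 0.3], ['i', 0.3, 0.4], ['t', 0.4, 0.5]]
# yield words ['do', 'it'] with words_timestamp [[0.1, 0.3], [0.3, 0.5]].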
#getting index of gif words in main transcript
def get_gif_word_indexes(total_words_list, gif_words_list):
    if not gif_words_list:
        return
    # small optimization: only check for a full match where the first word matches
    lengthgif_words_list = len(gif_words_list)
    firstgif_words_list = gif_words_list[0]
    print(f"total_words_list is :{total_words_list}")
    print(f"gif_words_list is :{gif_words_list}")
    for idx, item in enumerate(total_words_list):
        if item == firstgif_words_list:
            if total_words_list[idx:idx+lengthgif_words_list] == gif_words_list:
                yield tuple(range(idx, idx+lengthgif_words_list))
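# Example: searching ['do', 'it', 'just', 'do', 'it'] for ['do', 'it'] yields
# (0, 1) and then (3, 4); generate_gifs() keeps only the first match.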
#getting start and end timestamps for gif transcript
def get_gif_timestamps(giflist_indxs, words_timestamp):
    print(f"******** Inside get_gif_timestamps() **********")
    min_idx = min(giflist_indxs)
    max_idx = max(giflist_indxs)
    print(f"min_idx is :{min_idx}")
    print(f"max_idx is :{max_idx}")
    # slice out the word timestamps covered by the phrase
    gif_words_timestamp = words_timestamp[min_idx : max_idx + 1]
    print(f"gif_words_timestamp is :{gif_words_timestamp}")
    # the clip runs from the start of the first word to the end of the last word
    start_seconds, end_seconds = gif_words_timestamp[0][0], gif_words_timestamp[-1][-1]
    print(f"start_seconds, end_seconds are :{start_seconds},{end_seconds}")
    return start_seconds, end_seconds
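# Example (hypothetical): for giflist_indxs [2, 3, 4] with
# words_timestamp[2:5] == [[1.0, 1.3], [1.4, 1.8], [1.9, 2.2]],
# the clip runs from 1.0 s to 2.2 s.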
#extracting the video and building and serving a .gif image
def gen_moviepy_gif(start_seconds, end_seconds):
    print("******** inside gen_moviepy_gif() ***************")
    video_path = "./ShiaLaBeouf.mp4"
    video = mp.VideoFileClip(video_path)  #.resize(0.3)
    # cut the subclip covering the selected transcript
    final_clip = video.subclip(start_seconds, end_seconds)
    final_clip.write_gif("./gifimage.gif")  #, program='ffmpeg', tempfiles=True, fps=15, fuzz=3)
    final_clip.close()
    # re-open the gif and also write it out as an mp4 for the gr.Video component
    gif_img = mp.VideoFileClip("gifimage.gif")
    gif_img.write_videofile("gifimage.mp4")
    gif_vid = mp.VideoFileClip("gifimage.mp4")
    im = Image.open("gifimage.gif")
    vid_cap = cv2.VideoCapture('gifimage.mp4')
    return im, gif_img, gif_vid, vid_cap
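# If gif generation is slow or the file too large, moviepy's write_gif() accepts the
# tuning options noted in the commented call above
# (program='ffmpeg', tempfiles=True, fps=15, fuzz=3).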
sample_video = ['./ShiaLaBeouf.mp4']
sample_vid = gr.Video(label='Video file')  # for displaying the example
examples = gr.components.Dataset(components=[sample_vid], samples=[sample_video], type='values')

demo = gr.Blocks()

with demo:
    gr.Markdown("""This app is still a work in progress..""")
    with gr.Row():
        # incoming video and the transcript generated from it
        input_video = gr.Video(label="Upload a Video", visible=True)
        text_transcript = gr.Textbox(label="Transcripts", lines=10, interactive=True)
        # hidden textboxes that carry the word list and word timestamps between the two steps
        text_words = gr.Textbox(visible=False)
        text_wordstimestamps = gr.Textbox(visible=False)
        # copy-paste the required part of the transcript here to create a GIF from it
        text_gif_transcript = gr.Textbox(label="Transcripts", placeholder="Copy paste transcripts here to create GIF image", lines=3, interactive=True)

    examples.render()

    def load_examples(video):  # load the sample video into input_video when it is clicked
        print("****** inside load_examples() ******")
        print("in_video is : ", video[0])
        return video[0]

    examples.click(load_examples, examples, input_video)

    with gr.Row():
        button_transcript = gr.Button("Generate transcripts")
        button_gifs = gr.Button("Create Gif")

    with gr.Row():
        out_gif = gr.Video(label="Generated GIF from transcript selected")

    # step 1 fills the transcript plus the hidden word/timestamp textboxes;
    # step 2 reads those back and renders the selected clip into out_gif
    button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps])
    button_gifs.click(generate_gifs, [text_gif_transcript, text_words, text_wordstimestamps], out_gif)

demo.launch(debug=True)