Spaces:
Build error
Build error
File size: 6,859 Bytes
527d5e6 27f75e4 f79bf90 edaaf4f c3d46df 527d5e6 0bce71c 527d5e6 f09eeaa 527d5e6 f09eeaa 527d5e6 f09eeaa 527d5e6 f09eeaa 527d5e6 f09eeaa 4940b1c f09eeaa 4f08108 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import gradio as gr
import ffmpeg
from pathlib import Path
import os
import json
import base64
import requests
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
#headers = {"Authorization": "Bearer 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"}
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
def generate_transcripts(in_video): #generate_gifs(in_video, gif_transcript):
print("********* Inside generate_transcripts() **********")
#convert video to audio
print(f" input video is : {in_video}")
video_path = Path("./ShiaLaBeouf.mp4")
audio_memory, _ = ffmpeg.input(video_path).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
#Getting transcripts using wav2Vec2 huggingface hosted accelerated inference
#sending audio file in request along with stride and chunk length information
model_response = query_api(audio_memory)
#model response has both - transcripts as well as character timestamps or chunks
transcription = model_response["text"].lower()
chnk = model_response["chunks"]
#creating lists from chunks to consume downstream easily
timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
for chunk in chnk]
#getting words and word timestamps
words, words_timestamp = get_word_timestamps(timestamps)
print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}")
print(f"Total Word timestamps derived fromcharacter timestamp are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")
return transcription, words, words_timestamp
def generate_gifs(gif_transcript, words, words_timestamp):
print("********* Inside generate_gifs() **********")
#creating list from input gif transcript
gif = "don't let your dreams be dreams"
#gif = gif_transcript
giflist = gif.split()
#getting gif indexes from the generator
giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
#getting start and end timestamps for a gif video
start_seconds, end_seconds = get_gif_timestamps(giflist_indxs, words_timestamp)
#generated .gif image
generate_gif(start_seconds, end_seconds)
#("./gifimage.gif")
html_out = "<img src='./gifimage.gif' />"
return html_out
#calling the hosted model
def query_api(audio_bytes: bytes):
"""
Query for Huggingface Inference API for Automatic Speech Recognition task
"""
payload = json.dumps({
"inputs": base64.b64encode(audio_bytes).decode("utf-8"),
"parameters": {
"return_timestamps": "char",
"chunk_length_s": 10,
"stride_length_s": [4, 2]
},
"options": {"use_gpu": False}
}).encode("utf-8")
response = requests.request(
"POST", API_URL, headers=headers, data=payload)
json_reponse = json.loads(response.content.decode("utf-8"))
return json_reponse
#getting word timestamps from character timestamps
def get_word_timestamps(timestamps):
words, word = [], []
letter_timestamp, word_timestamp, words_timestamp = [], [], []
for idx,entry in enumerate(timestamps):
word.append(entry[0])
letter_timestamp.append(entry[1])
if entry[0] == ' ':
words.append(''.join(word))
word_timestamp.append(letter_timestamp[0])
word_timestamp.append(timestamps[idx-1][2])
words_timestamp.append(word_timestamp)
word, word_timestamp, letter_timestamp = [], [], []
words = [word.strip() for word in words]
return words, words_timestamp
#getting index of gif words in main transcript
def get_gif_word_indexes(total_words_list, gif_words_list):
if not gif_words_list:
return
# just optimization
lengthgif_words_list = len(gif_words_list)
firstgif_words_list = gif_words_list[0]
for idx, item in enumerate(total_words_list):
if item == firstgif_words_list:
if total_words_list[idx:idx+lengthgif_words_list] == gif_words_list:
yield tuple(range(idx, idx+lengthgif_words_list))
#getting start and end timestamps for gif transcript
def get_gif_timestamps(giflist_indxs, words_timestamp):
#giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
min_idx = min(giflist_indxs)
max_idx = max(giflist_indxs)
gif_words_timestamp = words_timestamp[min_idx : max_idx+1]
start_seconds, end_seconds = gif_words_timestamp[0][0], gif_words_timestamp[-1][-1]
return start_seconds, end_seconds
#extracting the video and building and serving a .gif image
def generate_gif(start_seconds, end_seconds):
final_clip = video.subclip(start_seconds, end_seconds)
#final_clip.write_videofile("/content/gdrive/My Drive/AI/videoedit/gif1.mp4")
final_clip.write_gif("./gifimage.gif",)
final_clip.close()
return
sample_video = ['./ShiaLaBeouf.mp4']
sample_vid = gr.Video(label='Video file') #for displaying the example
examples = gr.components.Dataset(components=[sample_vid], samples=[sample_video], type='values')
demo = gr.Blocks()
with demo:
with gr.Row():
input_video = gr.Video(label="Upload a Video", visible=True) #for incoming video
text_transcript = gr.Textbox(label="Transcripts", lines = 10, interactive = True ) #to generate and display transcriptions for input video
text_words = gr.Textbox(visible=False)
text_wordstimestamps = gr.Textbox(visible=False)
text_gif_transcript = gr.Textbox(label="Transcripts", placeholder="Copy paste transcripts here to create GIF image" , lines = 3, interactive = True ) #to copy paste required gif transcript
out_gif = gr.HTML(label="Generated GIF from transcript selected", show_label=True)
examples.render()
def load_examples(video): #to load sample video into input_video upon clicking on it
print("****** inside load_example() ******")
print("in_video is : ", video[0])
return video[0]
examples.click(load_examples, examples, input_video)
with gr.Row():
button_transcript = gr.Button("Generate transcripts")
button_gifs = gr.Button("Create Gif")
#def load_gif():
# print("****** inside load_gif() ******")
# #created embedding width='560' height='315'
# html_out = "<img src='./gifimage.gif' />"
# print(f"html output is : {html_out}")
# return
button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps ])
button_gifs.click(generate_gifs, [text_gif_transcript, text_words, text_wordstimestamps], out_gif )
demo.launch(debug=True) |