File size: 6,829 Bytes
527d5e6
 
 
27f75e4
f79bf90
527d5e6
 
0bce71c
 
 
527d5e6
 
f09eeaa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527d5e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f09eeaa
527d5e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f09eeaa
527d5e6
 
 
 
 
 
 
 
 
 
 
 
 
f09eeaa
527d5e6
 
 
f09eeaa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4940b1c
 
f09eeaa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f08108
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import ast
import base64
import json
import os
import urllib.error
import urllib.request
from pathlib import Path

import ffmpeg
import gradio as gr

# Hosted Inference API endpoint of the wav2vec2 speech-recognition model.
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
# The access token is read from the HF_TOKEN environment variable; a missing
# variable raises KeyError as soon as this module is loaded.
# Never hard-code a token here: any token that was ever committed to this file
# should be treated as leaked and revoked.
HF_TOKEN = os.environ["HF_TOKEN"]
# Authorization header sent with every request to API_URL.
headers = {"Authorization": f"Bearer {HF_TOKEN}"}


def generate_transcripts(in_video):
    """Transcribe the sample video and derive per-word timestamps.

    The audio track is extracted with ffmpeg as mono 16 kHz WAV bytes and sent
    to the hosted speech-recognition model through ``query_api``. The
    character-level chunks of the response are then merged into words.

    Args:
        in_video: Value of the video input component. It is only logged.
            NOTE(review): the sample file ``./ShiaLaBeouf.mp4`` is always
            transcribed, whatever was uploaded. Honouring ``in_video`` also
            requires the GIF step to cut the same file -- TODO.

    Returns:
        A ``(transcription, words, words_timestamp)`` tuple: the lower-cased
        transcript text, the list of words, and one ``[start, end]`` pair per
        word.

    Raises:
        RuntimeError: If the API response lacks the "text" or "chunks" keys
            (for example when the service answers with an error payload).
    """
    print("********* Inside generate_transcripts() **********")
    print(f" input video is : {in_video}")

    # Convert the video to in-memory WAV audio (mono, 16 kHz). The path is
    # handed over as a plain string rather than a Path object.
    video_path = Path("./ShiaLaBeouf.mp4")
    audio_memory, _ = (
        ffmpeg.input(str(video_path))
        .output('-', format="wav", ac=1, ar='16k')
        .overwrite_output()
        .global_args('-loglevel', 'quiet')
        .run(capture_stdout=True)
    )

    # Send the audio to the hosted model together with the stride and chunk
    # length settings configured in query_api.
    model_response = query_api(audio_memory)

    # Fail with the payload in the message instead of an opaque KeyError when
    # the service did not return a transcript.
    if (
        not isinstance(model_response, dict)
        or "text" not in model_response
        or "chunks" not in model_response
    ):
        raise RuntimeError(
            f"Unexpected response from the speech recognition API: {model_response}"
        )

    # The response carries the transcript and the per-character chunks.
    transcription = model_response["text"].lower()
    chnk = model_response["chunks"]

    # Flatten each chunk into [text, start, end] for easier downstream use.
    timestamps = [
        [chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
        for chunk in chnk
    ]

    # Merge characters into words and word-level timestamps.
    words, words_timestamp = get_word_timestamps(timestamps)
    print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}")
    print(f"Total Word timestamps derived fromcharacter timestamp are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")

    return transcription, words, words_timestamp
    
    
def generate_gifs(gif_transcript, words, words_timestamp):
    """Create a GIF covering the part of the video where a phrase is spoken.

    Args:
        gif_transcript: Phrase to look up in the transcript. When it is empty
            the built-in sample phrase is used instead.
        words: Transcript words, either as a list or as the string form of
            that list.
        words_timestamp: ``[start, end]`` pairs matching ``words``, either as
            a list or as the string form of that list.

    Returns:
        An HTML snippet: an ``<img>`` tag pointing at the generated GIF, or a
        short message when the phrase does not occur in the transcript.
    """
    print("********* Inside generate_gifs() **********")

    # The values travel through hidden Textbox components, which presumably
    # deliver the lists as their string representation -- parse them back.
    if isinstance(words, str):
        words = ast.literal_eval(words) if words.strip() else []
    if isinstance(words_timestamp, str):
        words_timestamp = (
            ast.literal_eval(words_timestamp) if words_timestamp.strip() else []
        )

    # Transcript words are lower-cased upstream, so normalise the phrase the
    # same way; fall back to the sample phrase when nothing was entered.
    phrase = (gif_transcript or "").strip().lower()
    if not phrase:
        phrase = "don't let your dreams be dreams"
    giflist = phrase.split()

    # Only the first occurrence of the phrase is used.
    giflist_indxs = next(get_gif_word_indexes(words, giflist), None)
    if giflist_indxs is None:
        return "<p>Could not find that phrase in the transcript.</p>"

    # Start and end timestamps of the clip to cut.
    start_seconds, end_seconds = get_gif_timestamps(
        list(giflist_indxs), words_timestamp
    )

    # Writes ./gifimage.gif
    generate_gif(start_seconds, end_seconds)
    html_out = "<img src='./gifimage.gif' />"

    return html_out

    
#calling the hosted model
def query_api(audio_bytes: bytes):
    """Query the Hugging Face Inference API for automatic speech recognition.

    The audio is base64-encoded into a JSON payload that also requests
    character-level timestamps and sets the chunk and stride lengths.

    Args:
        audio_bytes: Raw audio file content to transcribe.

    Returns:
        The decoded JSON body of the response. The body of a non-2xx answer
        is decoded and returned as well, so the caller can inspect it.

    Raises:
        urllib.error.URLError: If the service cannot be reached.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    payload = json.dumps({
        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
        "parameters": {
            "return_timestamps": "char",
            "chunk_length_s": 10,
            "stride_length_s": [4, 2]
        },
        "options": {"use_gpu": False}
    }).encode("utf-8")

    # Standard-library HTTP client: this file never imported `requests`.
    request = urllib.request.Request(
        API_URL,
        data=payload,
        headers={**headers, "Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request) as response:
            body = response.read()
    except urllib.error.HTTPError as err:
        # urllib raises on non-2xx statuses; keep the body so the caller still
        # receives whatever the service answered.
        body = err.read()

    return json.loads(body.decode("utf-8"))


#getting word timestamps from character timestamps
def get_word_timestamps(timestamps):
    """Merge character-level timestamps into word-level timestamps.

    Args:
        timestamps: Sequence of ``[text, start, end]`` entries, one per
            character, where a whitespace entry separates two words.

    Returns:
        A ``(words, words_timestamp)`` tuple. ``words`` is the list of words
        and ``words_timestamp`` holds one ``[start, end]`` pair per word: the
        start of its first character and the end of its last character.
    """
    words, words_timestamp = [], []
    letters = []
    word_start = word_end = None

    def flush():
        # Emit the word collected so far; separators with nothing collected
        # (leading or repeated spaces) must not produce empty words.
        if letters:
            words.append(''.join(letters).strip())
            words_timestamp.append([word_start, word_end])
            letters.clear()

    for entry in timestamps:
        text, start, end = entry[0], entry[1], entry[2]
        if text.isspace():
            flush()
            continue
        if not letters:
            word_start = start
        letters.append(text)
        word_end = end

    # The transcript usually does not end with a separator, so the last word
    # has to be emitted explicitly.
    flush()
    return words, words_timestamp


#getting index of gif words in main transcript
def get_gif_word_indexes(total_words_list, gif_words_list):
    """Yield the index tuples of every occurrence of a phrase in a word list.

    Args:
        total_words_list: All words of the transcript.
        gif_words_list: The consecutive words to search for.

    Yields:
        One tuple of consecutive indexes into ``total_words_list`` for each
        position where ``gif_words_list`` occurs. Nothing is yielded for an
        empty phrase.
    """
    if not gif_words_list:
        return

    phrase_len = len(gif_words_list)
    head = gif_words_list[0]
    for start, word in enumerate(total_words_list):
        stop = start + phrase_len
        # Cheap first-word test before comparing the whole window.
        if word == head and total_words_list[start:stop] == gif_words_list:
            yield tuple(range(start, stop))


#getting start and end timestamps for gif transcript
def get_gif_timestamps(giflist_indxs, words_timestamp):
    """Return the start and end time spanned by the selected words.

    Args:
        giflist_indxs: Indexes of the selected words in ``words_timestamp``.
        words_timestamp: One ``[start, end]`` pair per transcript word.

    Returns:
        A ``(start_seconds, end_seconds)`` tuple: the start of the word with
        the lowest index and the end of the word with the highest index.
    """
    first_idx, last_idx = min(giflist_indxs), max(giflist_indxs)
    selected = words_timestamp[first_idx:last_idx + 1]
    opening_word, closing_word = selected[0], selected[-1]
    return opening_word[0], closing_word[-1]


#extracting the  video and building and serving a .gif image
def generate_gif(start_seconds, end_seconds, video_path="./ShiaLaBeouf.mp4"):
    """Cut a time range out of a video and save it as ``./gifimage.gif``.

    The clip is produced with the module-level ``ffmpeg`` binding; the
    previous implementation read an undefined global named ``video``.

    Args:
        start_seconds: Start of the clip, in seconds from the beginning.
        end_seconds: End of the clip, in seconds from the beginning.
        video_path: Video file to cut. Defaults to the bundled sample video.

    Raises:
        ValueError: If ``end_seconds`` is not greater than ``start_seconds``.
    """
    duration = end_seconds - start_seconds
    if duration <= 0:
        raise ValueError(
            f"end_seconds ({end_seconds}) must be greater than start_seconds ({start_seconds})"
        )

    # Seek on the input side and limit the duration; the output format is
    # chosen by ffmpeg from the .gif extension.
    (
        ffmpeg.input(str(video_path), ss=start_seconds, t=duration)
        .output("./gifimage.gif")
        .overwrite_output()
        .global_args('-loglevel', 'quiet')
        .run()
    )
    return


# Sample shown to the user: one row holding the path of the bundled video.
sample_video = ['./ShiaLaBeouf.mp4']
sample_vid = gr.Video(label='Video file')  #for displaying the example
# Dataset component with a single sample; it is rendered inside the layout below.
examples = gr.components.Dataset(components=[sample_vid], samples=[sample_video], type='values')


# Top-level Blocks container for the whole UI.
demo = gr.Blocks()

with demo:
    # First row: video input, transcript boxes and the GIF output.
    with gr.Row():
        input_video = gr.Video(label="Upload a Video", visible=True)  #for incoming video
        text_transcript = gr.Textbox(label="Transcripts", lines = 10, interactive = True )  #to generate and display transcriptions for input video
        # Hidden boxes that carry the words and their timestamps from
        # generate_transcripts to generate_gifs.
        # NOTE(review): presumably a Textbox delivers these lists as strings -- confirm.
        text_words = gr.Textbox(visible=False)
        text_wordstimestamps = gr.Textbox(visible=False)
        text_gif_transcript = gr.Textbox(label="Transcripts", placeholder="Copy paste transcripts here to create GIF image" , lines = 3, interactive = True ) #to copy paste required gif transcript
        out_gif = gr.HTML(label="Generated GIF from transcript selected", show_label=True)
        
        examples.render()
        def load_examples(video):  #to load sample video into input_video upon clicking on it
            """Return the first element of the clicked sample row."""
            print("****** inside load_example() ******")
            print("in_video is : ", video[0])
            return video[0]
        
        # Clicking the sample passes it through load_examples into input_video.
        examples.click(load_examples, examples, input_video) 

    # Second row: the two action buttons.
    with gr.Row():
        button_transcript = gr.Button("Generate transcripts")
        button_gifs = gr.Button("Create Gif")
        
        #def load_gif():
        #    print("****** inside load_gif() ******")
        #    #created embedding  width='560' height='315' 
        #    html_out = "<img src='./gifimage.gif' />"
        #    print(f"html output is : {html_out}")
        #    return 

    # "Generate transcripts": generate_transcripts(input_video) fills the
    # transcript box and the two hidden boxes.
    button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps ])
    # "Create Gif": generate_gifs receives the phrase plus the hidden values
    # and its returned HTML is shown in out_gif.
    button_gifs.click(generate_gifs, [text_gif_transcript, text_words, text_wordstimestamps], out_gif )
    
# Start the app with debug=True.
demo.launch(debug=True)