File size: 9,381 Bytes
2a78406
9c97bc4
 
 
 
 
 
90d19eb
2a78406
cc201b9
 
 
 
 
9c97bc4
 
e97bbe0
d93a889
9c97bc4
90a4d51
 
9c97bc4
 
 
04e3933
9c97bc4
 
 
04e3933
e97bbe0
672e886
04e3933
c4472df
04e3933
9c97bc4
 
 
 
 
 
 
 
 
 
 
 
 
 
4dfc7d2
 
9c97bc4
4dfc7d2
9c97bc4
 
 
 
c105afe
 
cc201b9
c105afe
9c97bc4
 
 
c105afe
cc201b9
 
9c97bc4
 
ae8c816
9c97bc4
 
cc201b9
 
 
 
 
 
001aea6
c3ed386
a6084be
e97bbe0
dc291ae
f25f59c
a6084be
14406c9
3af57d4
 
9c97bc4
94a93f1
1aa32a9
cc00947
9c97bc4
04e3933
9c97bc4
 
 
cc201b9
9c97bc4
5ce127f
e6d9153
9c97bc4
e6d9153
7bcb343
8bfdff7
 
 
 
 
 
cc201b9
9c5bf65
7bcb343
 
 
d5fb975
98603de
9c5bf65
2a78406
 
 
742f7ca
2a78406
bdb7bc2
10f6cda
ad4e68c
 
 
 
 
14406c9
 
 
412db82
14406c9
 
 
2a78406
 
e6d9153
5b2a72b
cc201b9
 
e6d9153
 
cc201b9
c3ed386
c1f572c
4ac9312
0c2ffa7
 
 
 
cc201b9
 
 
412db82
a6084be
14406c9
c5bbce0
2a78406
c3ed386
cc201b9
a6084be
2fc726f
 
 
 
 
 
5862540
e049118
5862540
2fc726f
2a78406
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoTokenizer
from transformers import pipeline
from transformers import AutoModelForQuestionAnswering
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch 

# Extractive question-answering model (MiniLM fine-tuned on SQuAD 2.0):
# finds candidate answer spans inside the transcript text.
model_ckpt = "deepset/minilm-uncased-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)
# Sentence-embedding model: used to match the QA answer text back to the
# individual transcript lines (which carry the start timestamps).
modelST = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

#input - video link, output - full transcript
def get_transcript(link):
  """Fetch the transcript of a YouTube video.

  Args:
    link: a YouTube URL — either the watch?v=<id> form (extra query
          parameters such as a trailing &t=8077s are tolerated) or a
          short https://youtu.be/<id> link.

  Returns:
    (FinalTranscript, transcript, video_id): the whole transcript joined
    into one string, the raw list of {'text', 'start', 'duration'} dicts
    from youtube_transcript_api, and the extracted video id.
  """
  from urllib.parse import urlparse, parse_qs

  print("******** Inside get_transcript ********")
  print(f"link to be extracted is : {link}")

  parsed = urlparse(link)
  if parsed.hostname and parsed.hostname.endswith('youtu.be'):
    # Short-link form: https://youtu.be/<id>
    video_id = parsed.path.lstrip('/').split('/')[0]
  else:
    query = parse_qs(parsed.query)
    if 'v' in query:
      # Standard form: https://www.youtube.com/watch?v=<id>[&...]
      video_id = query['v'][0]
    else:
      # Fall back to the original naive parsing for unusual inputs:
      # take what follows the first '=' and drop later query params.
      video_id = link.split("=")[1].split("&")[0]
  print(f"video id extracted is : {video_id}")

  transcript = YouTubeTranscriptApi.get_transcript(video_id)
  FinalTranscript = ' '.join([i['text'] for i in transcript])
  return FinalTranscript, transcript, video_id
  
  
#input - question and transcript, output - answer timestamp
def get_answers_timestamp(question, final_transcript, transcript):
  """Locate the two most likely timestamps in the video answering `question`.

  Args:
    question: the user's question string.
    final_transcript: the whole transcript joined into one string (the
                      QA context).
    transcript: raw list of {'text', 'start', 'duration'} dicts from
                youtube_transcript_api.

  Returns:
    (start_timestamp, start_timestamp_secondbest): start times (whole
    seconds) for the best and second-best answer locations.
  """
  print("******** Inside get_answers_timestamp ********")

  context = final_transcript
  print(f"Input Question is : {question}")
  print(f"Type of trancript is : {type(context)}, Length of transcript is : {len(context)}")
  # Split the (potentially very long) context into overlapping 512-token
  # windows so the QA model can scan all of it.
  inputs = tokenizer(question, context, return_overflowing_tokens=True, max_length=512, stride = 25)

  #getting a list of contexts available after striding
  # (the plain text of each window is whatever follows the first [SEP])
  contx=[]
  for window in inputs["input_ids"]:
      contx.append(tokenizer.decode(window).split('[SEP]')[1].strip())

  # Run extractive QA over every window and collect the answers + scores.
  pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
  lst = [pipe(question=question, context=contexts) for contexts in contx]

  print(f"contx list is : {contx}")
  lst_scores = [dicts['score'] for dicts in lst]
  print(f"lst_scores is : {lst_scores}")

  # Rank windows by score without mutating the score list.
  # BUGFIX: the previous code removed the max from the list and searched
  # again, which shifted the runner-up's index by one whenever it came
  # after the best window, so the wrong answer text was embedded.
  ranked = sorted(range(len(lst_scores)), key=lst_scores.__getitem__, reverse=True)
  idxmax = ranked[0]
  # With a single window there is no distinct runner-up; reuse the best.
  idxmax2 = ranked[1] if len(ranked) > 1 else ranked[0]

  sentence_for_timestamp = lst[idxmax]['answer']
  sentence_for_timestamp_secondbest = lst[idxmax2]['answer']

  dftranscript = pd.DataFrame(transcript)

  # Embed every transcript line and both answers; the most similar line
  # tells us where in the video each answer is spoken.
  embedding_1= modelST.encode(dftranscript.text, convert_to_tensor=True)
  embedding_2 = modelST.encode(sentence_for_timestamp, convert_to_tensor=True)
  embedding_3 = modelST.encode(sentence_for_timestamp_secondbest, convert_to_tensor=True)

  def _timestamp_for(answer_embedding):
    # Back up 3 transcript lines so playback starts slightly before the
    # answer. BUGFIX: clamp at 0 — a negative iloc index would silently
    # wrap around to the END of the video.
    similarity_tensor = util.pytorch_cos_sim(embedding_1, answer_embedding)
    row = max(int(torch.argmax(similarity_tensor)) - 3, 0)
    return round(dftranscript.iloc[[row]].start.values[0])

  start_timestamp = _timestamp_for(embedding_2)
  start_timestamp_secondbest = _timestamp_for(embedding_3)

  return start_timestamp, start_timestamp_secondbest
   
    
def display_vid(url, question, sample_question=None, example_video=None):
  """Gradio click handler: answer `question` about the video and build embeds.

  Args:
    url: YouTube link typed by the user.
    question: question typed by the user ('' means "use sample_question").
    sample_question: optional question picked from the samples dropdown.
    example_video: optional list holding the sample video URL (from the
                   CheckboxGroup component); may be None or empty.

  Returns:
    (html_out, html_out_secondbest, sample_ques, url): two iframe embeds
    starting at the best / second-best answer timestamps, the question
    actually used, and the resolved video URL (echoed back to the inputs).
  """
  print("******** display_vid ********")
  if question == '':
    question = sample_question

  # A sample video selection overrides the typed URL.
  # BUGFIX: truthiness guard — the old `len(example_video) != 0` raised
  # TypeError when the parameter was its default None.
  if example_video:
    print(f"example_video is  : {example_video}")
    url = example_video[0]

  #get transcript
  final_transcript, transcript, video_id = get_transcript(url)

  #get answer timestamp
  #input - question and transcript, output - answer timestamp
  ans_timestamp, ans_timestamp_secondbest = get_answers_timestamp(question, final_transcript, transcript)

  # Embedded players that start playback at each answer's timestamp.
  html_out = "<iframe width='730' height='400' src='https://www.youtube.com/embed/" + video_id + "?start=" + str(ans_timestamp) + "' title='YouTube video player' frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture' allowfullscreen></iframe>"
  print(f"html output is : {html_out}")
  html_out_secondbest = "<iframe width='730' height='400' src='https://www.youtube.com/embed/" + video_id + "?start=" + str(ans_timestamp_secondbest) + "' title='YouTube video player' frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture' allowfullscreen></iframe>"

  if question == '':
    print(f"Inside display_vid(), Sample_Question coming from Radio box is BEFORE : {sample_question}")
    sample_ques = set_example_question(sample_question)
    print(f"Inside display_vid(), Sample Question coming from Radio box is AFTER : {sample_ques}")
  else:
    sample_ques = question
  return html_out, html_out_secondbest, sample_ques, url

def set_example_question(sample_question):
    """Mirror the chosen sample question back into the question component.

    Returns a gr.Radio.update carrying `sample_question` as the new value.
    """
    print(f"******* Inside Sample Questions ********")
    print(f"Sample Question coming from Radio box is : {sample_question}")
    update = gr.Radio.update(value=sample_question)
    # BUGFIX: this print was missing its f-prefix and printed the
    # template text literally instead of the actual return value.
    print(f"What is the Return value : {update}")
    return update #input_ques.update(example)

# ---------------------------------------------------------------------------
# Gradio UI: input a YouTube link + question, get back two embedded players
# cued to the best and second-best answer timestamps.
# ---------------------------------------------------------------------------
demo = gr.Blocks()

with demo:
  # Title banner and usage / caveat instructions.
  gr.Markdown("<h1><center>Ask a Question to a YouTube Video and get the Video played from the answer timestamp</center></h1>")
  gr.Markdown(
        """### A Space by [Yuvraj Sharma](https://huggingface.co/ysharma). How many times have you seen a long video/podcast on Youtube and wondered only if there would have been 'explanatory' timestamps it would have been so much better..
        **How to use this space:** You can either provide a new YouTube video link or can use the sample video link provided. Then provide a Questions that you would like about exploring the content in the given video.
        The App will generate timestamps and Play the video at those timestamps for you in the space provided. You will see two video displays, corresponding to two of the best guesses by the underlying models. Chances are that both videos might start with same timestamp, which will depend on the question and the content in the video, please bear! 
        Also, couple small caveats -
        - The App will perform as good as the available English Transcripts are for the given YouTube Video. If there are no transcripts, the App will not work. 
        - Please make sure the YouTube video links that you paste here don't have the trailing values like *&t=8077s*
        - Lastly, once you have queried a video, you might have to refresh the page for next query (will try and fix this)
        
        **Motivation behind building this App:** When we see a long video without timestamps, we often wonder 'if' the content we are looking for is in there, or 'where' in the video is the content we are looking for? The Idea is that we might have questions like 'Is the xxxx thing covered in this video?', or maybe 'does the host talks about the architecture of the xxxxx model', or maybe 'Does host talk about alien doorway on Mars?' and so on.
        
        **So this App could help you in reaching to that timestamp in 'Record time'!** 
        
        **Best part:** You don't even have to move away from the Space tab in your browser as the YouTube video gets played within the given View.
        """
    )
  # User inputs: video link and free-form question.
  with gr.Row():
    input_url = gr.Textbox(label="Input a Youtube video link") 
    input_ques = gr.Textbox(label="Ask a Question")

  # Outputs: two embedded players (best and second-best answer timestamp).
  with gr.Row():
    output_vid = gr.HTML(label="Video from timestamp 1", show_label=True)
    output_vid_secondbest = gr.HTML(label="Video from timestamp 2", show_label=True)
    
  # Canned questions that match the sample video offered below.
  with gr.Row():
    example_question = gr.Dropdown(
                    ["Choose a sample question", "Does video talk about different modalities", 
                    "does the model uses perceiver architecture?",
                    "when does the video talk about locked image tuning or lit?",
                    "comparison between gpt3 and jurassic?",
                    "Has flamingo passed turing test yet?",
                    "Any funny examples in video?",
                    "is it possible to download the stylegan model?",
                    "what was very cool?",
                    "what is the cool library?"], label= "Choose a sample Question", value=None)
  # Sample video (CheckboxGroup yields a list, hence example_video[0] in
  # display_vid when a selection is made).
  with gr.Row():
    example_video = gr.CheckboxGroup( ["https://www.youtube.com/watch?v=smUHQndcmOY"], label= "Choose a sample YouTube video") 
                                                                    
  b1 = gr.Button("Publish Video")
  
  # display_vid also writes the resolved question and URL back into the
  # input components (outputs 3 and 4).
  b1.click(display_vid, inputs=[input_url, input_ques, example_question, example_video], outputs=[output_vid, output_vid_secondbest, input_ques, input_url])
  
  with gr.Row():
    gr.Markdown('''
    #### Model Credits
    1. [Question Answering](https://huggingface.co/deepset/minilm-uncased-squad2)
    1. [Sentence Transformer](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
    ''')
  
  with gr.Row(): 
    gr.Markdown("![visitor badge](https://visitor-badge.glitch.me/badge?page_id=gradio-blocks_ask_questions_to_youtube_videos)")

demo.launch(enable_queue=True, debug=True)