ysharma HF staff committed on
Commit
04e3933
1 Parent(s): c915ab7
Files changed (1) hide show
  1. app.py +7 -7
app.py CHANGED
@@ -18,18 +18,18 @@ def get_transcript(link):
18
  print(f"video id extracted is : {video_id}")
19
  transcript = YouTubeTranscriptApi.get_transcript(video_id)
20
  FinalTranscript = ' '.join([i['text'] for i in transcript])
21
- return FinalTranscript, video_id
22
 
23
 
24
  #input - question and transcript, output - answer timestamp
25
- def get_answers_timestamp(question, transcript):
26
  print("******** Inside get_answers_timestamp ********")
27
  model_ckpt = "deepset/minilm-uncased-squad2"
28
  tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
29
  #question = "any funny examples in video??"
30
- context = transcript
31
  print(f"Input Question is : {question}")
32
- print(f"Type of trancript is : {type(transcript)}, Length of transcript is : {len(transcript)}")
33
  inputs = tokenizer(question, context, return_overflowing_tokens=True, max_length=512, stride = 25)
34
 
35
  #overlaps
@@ -57,7 +57,7 @@ def get_answers_timestamp(question, transcript):
57
  #idxmax, idxmax2
58
 
59
  idxcont = lst[idxmax2]['start']
60
- answer = transcript[len(contx[0])-135 + idxcont:]
61
  sentence_keyword = answer[:50]
62
 
63
  dftranscript = pd.DataFrame(transcript)
@@ -85,11 +85,11 @@ def display_vid(url, question):
85
  #print(html)
86
 
87
  #get transcript
88
- transcript, video_id = get_transcript(url)
89
 
90
  #get answer timestamp
91
  #input - question and transcript, output - answer timestamp
92
- ans_timestamp = get_answers_timestamp(question, transcript)
93
 
94
  #created embedding
95
  #sample - smUHQndcmOY?start=234
 
18
  print(f"video id extracted is : {video_id}")
19
  transcript = YouTubeTranscriptApi.get_transcript(video_id)
20
  FinalTranscript = ' '.join([i['text'] for i in transcript])
21
+ return FinalTranscript,transcript, video_id
22
 
23
 
24
  #input - question, final_transcript (segment texts joined into one string) and transcript (raw transcript entries), output - answer timestamp
25
+ def get_answers_timestamp(question, final_transcript, transcript):
26
  print("******** Inside get_answers_timestamp ********")
27
  model_ckpt = "deepset/minilm-uncased-squad2"
28
  tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
29
  #question = "any funny examples in video??"
30
+ context = final_transcript
31
  print(f"Input Question is : {question}")
32
+ print(f"Type of trancript is : {type(context)}, Length of transcript is : {len(context)}")
33
  inputs = tokenizer(question, context, return_overflowing_tokens=True, max_length=512, stride = 25)
34
 
35
  #overlaps
 
57
  #idxmax, idxmax2
58
 
59
  idxcont = lst[idxmax2]['start']
60
+ answer = final_transcript[len(contx[0])-135 + idxcont:]
61
  sentence_keyword = answer[:50]
62
 
63
  dftranscript = pd.DataFrame(transcript)
 
85
  #print(html)
86
 
87
  #get transcript
88
+ final_transcript, transcript, video_id = get_transcript(url)
89
 
90
  #get answer timestamp
91
  #input - question, final_transcript and transcript, output - answer timestamp
92
+ ans_timestamp = get_answers_timestamp(question, final_transcript, transcript)
93
 
94
  #created embedding
95
  #sample - smUHQndcmOY?start=234