AkashKhamkar committed
Commit
e03b3f8
1 Parent(s): 7c768ea

Upload 2 files

Files changed (2)
  1. app.py +187 -0
  2. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,187 @@
+ import streamlit as st
+ import torch
+ import requests
+ import string
+ import traceback
+ import nltk
+ import pke
+ from youtube_transcript_api import YouTubeTranscriptApi
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from nltk.tokenize import sent_tokenize
+ from nltk.corpus import stopwords
+ from flashtext import KeywordProcessor
+
+ # Corpora needed by sent_tokenize, stopwords and pke.
+ nltk.download('wordnet')
+ nltk.download('punkt')
+ nltk.download('brown')
+ nltk.download('stopwords')
+
+ # Backend endpoint that stores selected question/answer pairs.
+ link = "http://127.0.0.1:8000/question"
+
+ # T5 models: t5-base for summarization, a SQuAD-tuned T5 for question generation.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ summary_tokenizer = AutoTokenizer.from_pretrained("t5-base")
+ summary_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to(device)
+ question_tokenizer = AutoTokenizer.from_pretrained("ramsrigouthamg/t5_squad_v1")
+ question_model = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5_squad_v1").to(device)
+
+
+ def query(url, payload):
+     """POST a question/answer pair to the backend and return the response."""
+     return requests.post(url, json=payload)
+
+
+ def fetch_transcript(url):
+     """Join the caption segments of a YouTube video into a single string."""
+     # Assumes a .../watch?v=<id> style URL.
+     vid = url.split("v=")[1].split("&")[0]
+     transcript = YouTubeTranscriptApi.get_transcript(vid)
+     return " ".join(segment["text"] for segment in transcript)
+
+
+ def postprocesstext(content):
+     """Capitalize each sentence of the generated summary."""
+     final = ""
+     for sent in sent_tokenize(content):
+         final += " " + sent.capitalize()
+     return final
+
+
+ def summarizer(text, model, tokenizer):
+     """Summarize the transcript with T5's "summarize:" task prefix."""
+     text = "summarize: " + text.strip().replace("\n", " ")
+     encoding = tokenizer.encode_plus(text, max_length=512, truncation=True,
+                                      return_tensors="pt").to(device)
+     outs = model.generate(input_ids=encoding["input_ids"],
+                           attention_mask=encoding["attention_mask"],
+                           early_stopping=True,
+                           num_beams=3,
+                           num_return_sequences=1,
+                           no_repeat_ngram_size=2,
+                           min_length=75,
+                           max_length=300)
+     summary = tokenizer.decode(outs[0], skip_special_tokens=True)
+     return postprocesstext(summary).strip()
+
+
+ def get_nouns_multipartite(content):
+     """Extract up to 15 noun keyphrases with MultipartiteRank."""
+     out = []
+     try:
+         extractor = pke.unsupervised.MultipartiteRank()
+         # Exclude punctuation, bracket tokens and stopwords from candidates.
+         stoplist = list(string.punctuation)
+         stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+         stoplist += stopwords.words('english')
+         extractor.load_document(input=content, stoplist=stoplist)
+         # Only proper nouns and nouns qualify as keyphrase candidates.
+         extractor.candidate_selection(pos={'PROPN', 'NOUN'})
+         extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')
+         out = [phrase for phrase, _ in extractor.get_n_best(n=15)]
+     except Exception:
+         out = []
+         traceback.print_exc()
+     return out
+
+
+ def get_keywords(originaltext, summarytext, count):
+     """Keep the transcript keyphrases that also occur in the summary."""
+     keywords = get_nouns_multipartite(originaltext)
+     print("keywords unsummarized: ", keywords)
+     keyword_processor = KeywordProcessor()
+     for keyword in keywords:
+         keyword_processor.add_keyword(keyword)
+     keywords_found = list(set(keyword_processor.extract_keywords(summarytext)))
+     print("keywords_found in summarized: ", keywords_found)
+     important_keywords = [kw for kw in keywords if kw in keywords_found]
+     return important_keywords[:int(count)]
+
+
+ def get_question(context, answer, model, tokenizer):
+     """Generate a question for the given answer, using the summary as context."""
+     text = "context: {} answer: {}".format(context, answer)
+     encoding = tokenizer.encode_plus(text, max_length=384, truncation=True,
+                                      return_tensors="pt").to(device)
+     outs = model.generate(input_ids=encoding["input_ids"],
+                           attention_mask=encoding["attention_mask"],
+                           early_stopping=True,
+                           num_beams=5,
+                           num_return_sequences=1,
+                           no_repeat_ngram_size=2,
+                           max_length=72)
+     question = tokenizer.decode(outs[0], skip_special_tokens=True)
+     return question.replace("question:", "").strip()
+
+
+ def generate_qna(url, count):
+     """Full pipeline: transcript -> summary -> keywords -> one question per keyword."""
+     transcript = fetch_transcript(url)
+     summarized_text = summarizer(transcript, summary_model, summary_tokenizer)
+     keywords = get_keywords(transcript, summarized_text, count)
+     qna = []
+     for answer in keywords:
+         question = get_question(summarized_text, answer, question_model, question_tokenizer)
+         qna.append(question + ' : ' + answer)
+     return qna
+
+
+ def main():
+     if 'submitted' not in st.session_state:
+         st.session_state.submitted = False
+     if 'opt' not in st.session_state:
+         st.session_state.opt = []
+
+     def callback():
+         st.session_state.submitted = True
+
+     st.title('QnA Pair Generator')
+     url = st.text_input('Enter the video link')
+     count = st.text_input('Enter the number of questions you want to generate')
+
+     if st.button("Submit URL", on_click=callback) and url and count:
+         st.write("Thanks for the submission!")
+         st.session_state.opt = generate_qna(url, count)
+
+     if st.session_state.submitted and st.session_state.opt:
+         option = st.multiselect('Select the questions you want to add to the database',
+                                 st.session_state.opt)
+         if option and st.button("Add question"):
+             for pair in option:
+                 # Pairs are formatted "question : answer"; split on the last
+                 # separator so a colon inside the question text survives.
+                 question, answer = pair.rsplit(' : ', 1)
+                 response = query(link, {"question": question, "answer": answer})
+                 st.write(response.text)
+
+
+ main()
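
The server behind link is not part of this commit, so the receiving side of query() is unknown. As a rough sketch only, assuming a FastAPI service whose POST /question route accepts the {"question", "answer"} JSON that the app sends (the route shape, model, and storage below are all assumptions, not the author's code):

# Hypothetical companion server for the POST target above -- a minimal
# sketch, not part of the commit.
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
stored_pairs = []  # assumed in-memory store; a real backend would persist pairs


class QAPair(BaseModel):
    question: str
    answer: str


@app.post("/question")
def add_question(pair: QAPair):
    # requests.post(url, json=payload) in app.py sends JSON that FastAPI
    # validates into QAPair; whatever is returned here is what shows up
    # in response.text on the Streamlit side.
    stored_pairs.append(pair)
    return {"status": "stored", "question": pair.question, "answer": pair.answer}

With the sketch saved as backend.py (the filename is an assumption), uvicorn backend:app --port 8000 followed by streamlit run app.py runs the two halves together.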
requirements.txt ADDED
Binary file (3.28 kB)
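
The requirements file is uploaded as a binary blob, so its exact contents and version pins are not visible here. Judging only from the imports in app.py, it has to cover at least the following distributions (pke is usually installed straight from its GitHub repository, and the T5 tokenizer additionally needs sentencepiece):

streamlit
torch
transformers
sentencepiece
nltk
flashtext
requests
youtube-transcript-api
git+https://github.com/boudinfl/pke.git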