Rehman1603 committed on
Commit
e84a10b
1 Parent(s): bc125e7

Create mcq.py

Files changed (1)
  1. mcq.py +305 -0
mcq.py ADDED
@@ -0,0 +1,305 @@
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
import spacy
import zipfile
import os
import json
from sense2vec import Sense2Vec
import requests
from collections import OrderedDict
import string
import pke
import nltk
from nltk import FreqDist
nltk.download('brown')
nltk.download('stopwords')
nltk.download('popular')
from nltk.corpus import stopwords
from nltk.corpus import brown
from similarity.normalized_levenshtein import NormalizedLevenshtein
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

def MCQs_available(word, s2v):
    # Distractors can only be generated if sense2vec has a sense for the word.
    word = word.replace(" ", "_")
    sense = s2v.get_best_sense(word)
    return sense is not None


def edits(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz ' + string.punctuation
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def sense2vec_get_words(word, s2v):
    # Collect sense2vec neighbours of `word` as candidate distractors,
    # skipping anything that is a trivial variant (one edit away) of the answer.
    output = []

    word_preprocessed = word.translate(word.maketrans("", "", string.punctuation))
    word_preprocessed = word_preprocessed.lower()

    word_edits = edits(word_preprocessed)

    word = word.replace(" ", "_")

    sense = s2v.get_best_sense(word)
    most_similar = s2v.most_similar(sense, n=15)

    compare_list = [word_preprocessed]
    for each_word in most_similar:
        append_word = each_word[0].split("|")[0].replace("_", " ")
        append_word = append_word.strip()
        append_word_processed = append_word.lower()
        append_word_processed = append_word_processed.translate(append_word_processed.maketrans("", "", string.punctuation))
        if append_word_processed not in compare_list and word_preprocessed not in append_word_processed and append_word_processed not in word_edits:
            output.append(append_word.title())
            compare_list.append(append_word_processed)

    # Deduplicate while preserving order.
    out = list(OrderedDict.fromkeys(output))

    return out

def get_options(answer, s2v):
    # Try to build distractors with sense2vec; fall back to an empty list on failure.
    distractors = []

    try:
        distractors = sense2vec_get_words(answer, s2v)
        if len(distractors) > 0:
            print(" Sense2vec_distractors successful for word : ", answer)
            return distractors, "sense2vec"
    except Exception:
        print(" Sense2vec_distractors failed for word : ", answer)

    return distractors, "None"


def tokenize_sentences(text):
    sentences = sent_tokenize(text)
    # Remove any short sentences less than 20 letters.
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
    return sentences


def get_sentences_for_keyword(keywords, sentences):
    # Map each keyword to the sentences that mention it, longest sentences first.
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        word = word.strip()
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values

    # Drop keywords that never matched a sentence.
    delete_keys = []
    for k in keyword_sentences.keys():
        if len(keyword_sentences[k]) == 0:
            delete_keys.append(k)
    for del_key in delete_keys:
        del keyword_sentences[del_key]

    return keyword_sentences

def is_far(words_list, currentword, thresh, normalized_levenshtein):
    # True if `currentword` is at least `thresh` normalized edit distance away
    # from every word already kept.
    threshold = thresh
    score_list = []
    for word in words_list:
        score_list.append(normalized_levenshtein.distance(word.lower(), currentword.lower()))
    if min(score_list) >= threshold:
        return True
    else:
        return False


def filter_phrases(phrase_keys, max_phrases, normalized_levenshtein):
    # Greedily keep phrases that are sufficiently dissimilar from the ones already kept.
    filtered_phrases = []
    if len(phrase_keys) > 0:
        filtered_phrases.append(phrase_keys[0])
        for ph in phrase_keys[1:]:
            if is_far(filtered_phrases, ph, 0.7, normalized_levenshtein):
                filtered_phrases.append(ph)
            if len(filtered_phrases) >= max_phrases:
                break
    return filtered_phrases


def get_nouns_multipartite(text):
    out = []

    extractor = pke.unsupervised.MultipartiteRank()
    extractor.load_document(input=text, language='en')
    pos = {'PROPN', 'NOUN'}
    # NOTE: the stoplist is built here but not passed on to pke in this version.
    stoplist = list(string.punctuation)
    stoplist += stopwords.words('english')
    extractor.candidate_selection(pos=pos)
    # Build the Multipartite graph and rank candidates using random walk;
    # alpha controls the weight adjustment mechanism, see TopicRank for
    # threshold/method parameters.
    try:
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
    except Exception:
        return out

    keyphrases = extractor.get_n_best(n=10)

    for key in keyphrases:
        out.append(key[0])

    return out

def get_phrases(doc):
    # Count multi-word noun chunks and return up to 50 of them, longest first.
    phrases = {}
    for np in doc.noun_chunks:
        phrase = np.text
        len_phrase = len(phrase.split())
        if len_phrase > 1:
            if phrase not in phrases:
                phrases[phrase] = 1
            else:
                phrases[phrase] = phrases[phrase] + 1

    phrase_keys = list(phrases.keys())
    phrase_keys = sorted(phrase_keys, key=lambda x: len(x), reverse=True)
    phrase_keys = phrase_keys[:50]
    return phrase_keys


def get_keywords(nlp, text, max_keywords, s2v, fdist, normalized_levenshtein, no_of_sentences):
    doc = nlp(text)
    max_keywords = int(max_keywords)

    # Rank single keywords with MultipartiteRank, rarest (by Brown frequency) first.
    keywords = get_nouns_multipartite(text)
    keywords = sorted(keywords, key=lambda x: fdist[x])
    keywords = filter_phrases(keywords, max_keywords, normalized_levenshtein)

    phrase_keys = get_phrases(doc)
    filtered_phrases = filter_phrases(phrase_keys, max_keywords, normalized_levenshtein)

    total_phrases = keywords + filtered_phrases

    total_phrases_filtered = filter_phrases(total_phrases, min(max_keywords, 2 * no_of_sentences), normalized_levenshtein)

    # Keep only candidates for which sense2vec can supply distractors.
    answers = []
    for answer in total_phrases_filtered:
        if answer not in answers and MCQs_available(answer, s2v):
            answers.append(answer)

    answers = answers[:max_keywords]
    return answers

def generate_questions_mcq(keyword_sent_mapping, device, tokenizer, model, sense2vec, normalized_levenshtein):
    # Build one "context: ... answer: ..." prompt per keyword and generate questions in a single batch.
    batch_text = []
    answers = keyword_sent_mapping.keys()
    for answer in answers:
        txt = keyword_sent_mapping[answer]
        context = "context: " + txt
        text = context + " " + "answer: " + answer + " </s>"
        batch_text.append(text)

    encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")

    print("Running model for generation")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        outs = model.generate(input_ids=input_ids,
                              attention_mask=attention_masks,
                              max_length=150)

    output_array = {}
    output_array["questions"] = []

    for index, val in enumerate(answers):
        individual_question = {}
        out = outs[index, :]
        dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        Question = dec.replace("question:", "")
        Question = Question.strip()
        individual_question["question_statement"] = Question
        individual_question["question_type"] = "MCQ"
        individual_question["answer"] = val
        individual_question["id"] = index + 1
        individual_question["options"], individual_question["options_algorithm"] = get_options(val, sense2vec)

        individual_question["options"] = filter_phrases(individual_question["options"], 10, normalized_levenshtein)
        # Keep the first three distractors as options; the rest become extra_options.
        num_options = 3
        individual_question["extra_options"] = individual_question["options"][num_options:]
        individual_question["options"] = individual_question["options"][:num_options]
        individual_question["context"] = keyword_sent_mapping[val]

        if len(individual_question["options"]) > 0:
            output_array["questions"].append(individual_question)

    return output_array

def generate_normal_questions(keyword_sent_mapping, device, tokenizer, model):  # for normal one-word questions
    batch_text = []
    answers = keyword_sent_mapping.keys()
    for answer in answers:
        txt = keyword_sent_mapping[answer]
        context = "context: " + txt
        text = context + " " + "answer: " + answer + " </s>"
        batch_text.append(text)

    encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")

    print("Running model for generation")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        outs = model.generate(input_ids=input_ids,
                              attention_mask=attention_masks,
                              max_length=150)

    output_array = {}
    output_array["questions"] = []

    for index, val in enumerate(answers):
        individual_quest = {}
        out = outs[index, :]
        dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        Question = dec.replace('question:', '')
        Question = Question.strip()

        individual_quest['Question'] = Question
        individual_quest['Answer'] = val
        individual_quest["id"] = index + 1
        individual_quest["context"] = keyword_sent_mapping[val]

        output_array["questions"].append(individual_quest)

    return output_array


def random_choice():
    # Random boolean coin flip.
    a = random.choice([0, 1])
    return bool(a)
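
For reference, a minimal sketch of how these helpers could be wired together. The tokenizer/model checkpoint names, the "s2v_old" path, and the sample text are assumptions for illustration, not part of this commit:

import torch
import spacy
from nltk import FreqDist
from nltk.corpus import brown
from sense2vec import Sense2Vec
from similarity.normalized_levenshtein import NormalizedLevenshtein
from transformers import T5ForConditionalGeneration, T5Tokenizer
from mcq import tokenize_sentences, get_keywords, get_sentences_for_keyword, generate_questions_mcq

# Hypothetical setup: any T5 checkpoint fine-tuned for question generation, plus a local sense2vec archive.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("Parth/result").to(device)  # placeholder checkpoint name
s2v = Sense2Vec().from_disk("s2v_old")  # assumed local path to a sense2vec model
nlp = spacy.load("en_core_web_sm")
fdist = FreqDist(brown.words())
normalized_levenshtein = NormalizedLevenshtein()

text = "The Great Wall of China was built over centuries to protect Chinese states against invasions from the north."
sentences = tokenize_sentences(text)
keywords = get_keywords(nlp, text, 4, s2v, fdist, normalized_levenshtein, len(sentences))
keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
# generate_questions_mcq expects each value to be a single context string, so join the top sentences.
for k in keyword_sentence_mapping:
    keyword_sentence_mapping[k] = " ".join(keyword_sentence_mapping[k][:3])
questions = generate_questions_mcq(keyword_sentence_mapping, device, tokenizer, model, s2v, normalized_levenshtein)
print(questions)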