DevBM committed on
Commit
22d77cb
1 Parent(s): 9de55fe

Upload main2.py

Files changed (1)
  1. Questgen/main2.py +531 -0
Questgen/main2.py ADDED
@@ -0,0 +1,531 @@
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import spacy
import zipfile
import os
import json
from sense2vec import Sense2Vec
import requests
from collections import OrderedDict
import string
import pke
import nltk
import yake
from nltk import FreqDist
nltk.download('brown', quiet=True, force=True)
nltk.download('stopwords', quiet=True, force=True)
nltk.download('popular', quiet=True, force=True)
from nltk.corpus import stopwords
from nltk.corpus import brown
from similarity.normalized_levenshtein import NormalizedLevenshtein
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
# from Questgen.encoding.encoding import beam_search_decoding
# from Questgen.mcq.mcq import tokenize_sentences
# from Questgen.mcq.mcq import get_keywords
# from Questgen.mcq.mcq import get_sentences_for_keyword
# from Questgen.mcq.mcq import generate_questions_mcq
# from Questgen.mcq.mcq import generate_normal_questions


def beam_search_decoding(inp_ids, attn_mask, model, tokenizer):
    # Generate candidate questions with beam search and return the top 3.
    beam_output = model.generate(input_ids=inp_ids,
                                 attention_mask=attn_mask,
                                 max_length=256,
                                 num_beams=10,
                                 num_return_sequences=3,
                                 no_repeat_ngram_size=2,
                                 early_stopping=True)
    Questions = [tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                 for out in beam_output]
    return [Question.strip().capitalize() for Question in Questions]


def MCQs_available(word, s2v):
    # A word can be turned into an MCQ only if sense2vec knows a best sense for it.
    word = word.replace(" ", "_")
    sense = s2v.get_best_sense(word)
    return sense is not None


def edits(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz ' + string.punctuation
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def sense2vec_get_words(word, s2v):
    output = []

    word_preprocessed = word.translate(word.maketrans("", "", string.punctuation))
    word_preprocessed = word_preprocessed.lower()

    word_edits = edits(word_preprocessed)

    word = word.replace(" ", "_")

    sense = s2v.get_best_sense(word)
    most_similar = s2v.most_similar(sense, n=15)

    compare_list = [word_preprocessed]
    for each_word in most_similar:
        append_word = each_word[0].split("|")[0].replace("_", " ")
        append_word = append_word.strip()
        append_word_processed = append_word.lower()
        append_word_processed = append_word_processed.translate(append_word_processed.maketrans("", "", string.punctuation))
        # Keep only candidates that are not trivial variants (one-edit neighbours) of the original word.
        if append_word_processed not in compare_list and word_preprocessed not in append_word_processed and append_word_processed not in word_edits:
            output.append(append_word.title())
            compare_list.append(append_word_processed)

    out = list(OrderedDict.fromkeys(output))

    return out

def get_options(answer, s2v):
    distractors = []

    try:
        distractors = sense2vec_get_words(answer, s2v)
        if len(distractors) > 0:
            print(" Sense2vec_distractors successful for word : ", answer)
            return distractors, "sense2vec"
    except Exception:
        print(" Sense2vec_distractors failed for word : ", answer)

    return distractors, "None"

def tokenize_sentences(text):
    sentences = sent_tokenize(text)
    # Drop very short fragments (5 characters or fewer).
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 5]
    return sentences


def get_sentences_for_keyword(keywords, sentences):
    # Map each keyword to the sentences that contain it, longest sentences first.
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        word = word.strip()
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values

    # Drop keywords that never appear in any sentence.
    delete_keys = []
    for k in keyword_sentences.keys():
        if len(keyword_sentences[k]) == 0:
            delete_keys.append(k)
    for del_key in delete_keys:
        del keyword_sentences[del_key]
    print(keyword_sentences)
    return keyword_sentences


def is_far(words_list, currentword, thresh, normalized_levenshtein):
    threshold = thresh
    score_list = []
    for word in words_list:
        score_list.append(normalized_levenshtein.distance(word.lower(), currentword.lower()))
    if min(score_list) >= threshold:
        return True
    else:
        return False

def filter_phrases(phrase_keys, max_phrases, normalized_levenshtein):
    # Keep phrases that are sufficiently different (normalized Levenshtein distance >= 0.7)
    # from the ones already kept, up to max_phrases.
    filtered_phrases = []
    if len(phrase_keys) > 0:
        filtered_phrases.append(phrase_keys[0])
        for ph in phrase_keys[1:]:
            if is_far(filtered_phrases, ph, 0.7, normalized_levenshtein):
                filtered_phrases.append(ph)
            if len(filtered_phrases) >= max_phrases:
                break
    return filtered_phrases


def get_nouns_multipartite(text):
    # out = []

    # extractor = pke.unsupervised.MultipartiteRank()
    # extractor.load_document(input=text, language='en')
    # pos = {'PROPN', 'NOUN'}
    # stoplist = list(string.punctuation)
    # stoplist += stopwords.words('english')
    # extractor.candidate_selection(pos=pos)
    # # 4. build the Multipartite graph and rank candidates using random walk,
    # # alpha controls the weight adjustment mechanism, see TopicRank for
    # # threshold/method parameters.
    # try:
    #     extractor.candidate_weighting(alpha=1.1,
    #                                   threshold=0.75,
    #                                   method='average')
    # except:
    #     return out

    # keyphrases = extractor.get_n_best(n=10)

    # for key in keyphrases:
    #     out.append(key[0])

    # nlp = spacy.load("en_core_web_sm")
    # labels = nlp(text)

    # for i in (labels.ents):
    #     out.append(str(i))
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
    # Extract named entities using spaCy
    spacy_entities = [ent.text for ent in doc.ents]
    print(f"\n\nSpacy Entities: {spacy_entities}\n\n")
    # Extract named entities using BERT-based NER (printed for comparison only;
    # only the spaCy entities are kept below)
    bert_entities = [entity['word'] for entity in ner_pipeline(text)]
    print(f"BERT Entities: {bert_entities}\n\n")
    entities = list(set(spacy_entities))

    # Extract noun chunks and verb lemmas using spaCy
    nouns = [chunk.text for chunk in doc.noun_chunks]
    verbs = [token.lemma_ for token in doc if token.pos_ == 'VERB']
    print(f"Spacy Nouns: {nouns}\n\n")
    print(f"Spacy Verbs: {verbs}\n\n")

    # Use YAKE for keyphrase extraction
    yake_extractor = yake.KeywordExtractor()
    yake_keywords = yake_extractor.extract_keywords(text)
    yake_keywords = [kw[0] for kw in yake_keywords]
    print(f"Yake: {yake_keywords}\n\n")
    # Combine all keyword sources and remove duplicates
    combined_keywords = list(set(entities + nouns + verbs + yake_keywords))

    # Cluster near-duplicate keywords by TF-IDF cosine similarity and keep one
    # representative per cluster.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(combined_keywords)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    clusters = []

    similarity_threshold = 0.45

    for idx, word in enumerate(combined_keywords):
        added_to_cluster = False
        for cluster in clusters:
            # Check if the word is similar to any word in the existing cluster
            if any(similarity_matrix[idx, other_idx] > similarity_threshold for other_idx in cluster):
                cluster.append(idx)
                added_to_cluster = True
                break
        if not added_to_cluster:
            clusters.append([idx])

    # Select one representative word from each cluster
    representative_words = [combined_keywords[cluster[0]] for cluster in clusters]

    print("Keywords after removing similar words: ", representative_words)
    # return combined_keywords

    return representative_words


def get_phrases(doc):
    phrases = {}
    for noun_chunk in doc.noun_chunks:
        phrase = noun_chunk.text
        len_phrase = len(phrase.split())
        if len_phrase > 1:
            if phrase not in phrases:
                phrases[phrase] = 1
            else:
                phrases[phrase] = phrases[phrase] + 1

    phrase_keys = list(phrases.keys())
    phrase_keys = sorted(phrase_keys, key=len, reverse=True)
    phrase_keys = phrase_keys[:50]
    return phrase_keys


def get_keywords(nlp, text, max_keywords, s2v, fdist, normalized_levenshtein, no_of_sentences):
    doc = nlp(text)
    max_keywords = int(max_keywords)

    keywords = get_nouns_multipartite(text)
    # keywords = sorted(keywords, key=lambda x: fdist[x])
    # keywords = filter_phrases(keywords, max_keywords, normalized_levenshtein)

    # phrase_keys = get_phrases(doc)
    # filtered_phrases = filter_phrases(phrase_keys, max_keywords, normalized_levenshtein)

    # total_phrases = keywords + filtered_phrases

    # total_phrases_filtered = filter_phrases(total_phrases, min(max_keywords, 2 * no_of_sentences), normalized_levenshtein)
    total_phrases_filtered = keywords

    # Keep only keywords for which sense2vec can supply distractors.
    answers = []
    for answer in total_phrases_filtered:
        if answer not in answers and MCQs_available(answer, s2v):
            answers.append(answer)

    # answers = answers[:max_keywords]
    # answers = keywords
    return answers

def generate_questions_mcq(keyword_sent_mapping, device, tokenizer, model, sense2vec, normalized_levenshtein):
    batch_text = []
    answers = list(keyword_sent_mapping.keys())  # Get all answers from the keys

    # Build one input per answer, using its top-ranked sentences as context,
    # so the generated outputs stay aligned with the answers list below.
    for answer in answers:
        value_list = keyword_sent_mapping[answer]  # List of sentences for this answer
        txt = " ".join(value_list[:3])
        text = "<context>\t" + txt + "\t<answer>\t" + answer
        batch_text.append(text)

    encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")

    print("Running model for generation")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        outs = model.generate(input_ids=input_ids,
                              attention_mask=attention_masks,
                              max_length=150)

    output_array = {"questions": []}

    for index, val in enumerate(answers):
        out = outs[index, :]
        dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        Question = dec.replace("question:", "")
        Question = Question.strip()

        individual_question = {
            "question_statement": Question,
            "question_type": "MCQ",
            "answer": val,
            "id": index + 1,
            "options": [],
            "options_algorithm": [],
            "extra_options": [],
            "context": keyword_sent_mapping[val]  # List of sentences containing this answer
        }

        # Get options and filter out near-duplicates
        individual_question["options"], individual_question["options_algorithm"] = get_options(val, sense2vec)
        individual_question["options"] = filter_phrases(individual_question["options"], 10, normalized_levenshtein)

        # Keep the first three options; the remainder become extra options
        num_options = 3
        individual_question["extra_options"] = individual_question["options"][num_options:]
        individual_question["options"] = individual_question["options"][:num_options]

        if len(individual_question["options"]) > 0:
            output_array["questions"].append(individual_question)

    return output_array


def generate_normal_questions(keyword_sent_mapping, device, tokenizer, model):  # for normal one-word questions
    batch_text = []
    answers = keyword_sent_mapping.keys()
    for answer in answers:
        txt = keyword_sent_mapping[answer]
        context = "context: " + txt
        text = context + " " + "answer: " + answer + " </s>"
        batch_text.append(text)

    encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")

    print("Running model for generation")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        outs = model.generate(input_ids=input_ids,
                              attention_mask=attention_masks,
                              max_length=150)

    output_array = {}
    output_array["questions"] = []

    for index, val in enumerate(answers):
        individual_quest = {}
        out = outs[index, :]
        dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        Question = dec.replace('question:', '')
        Question = Question.strip()

        individual_quest['Question'] = Question
        individual_quest['Answer'] = val
        individual_quest["id"] = index + 1
        individual_quest["context"] = keyword_sent_mapping[val]

        output_array["questions"].append(individual_quest)

    return output_array

def random_choice():
    a = random.choice([0, 1])
    return bool(a)

class QGen:

    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
        model = T5ForConditionalGeneration.from_pretrained('DevBM/t5-large-squad')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = spacy.load('en_core_web_sm')

        self.s2v = Sense2Vec().from_disk('s2v_old')

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)

    def set_seed(self, seed):
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def predict_mcq(self, payload):
        start = time.time()
        inp = {
            "input_text": payload.get("input_text"),
            "max_questions": payload.get("max_questions", 4)
        }

        text = inp['input_text']
        sentences = tokenize_sentences(text)
        joiner = " "
        modified_text = joiner.join(sentences)

        keywords = get_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))

        keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

        # for k in keyword_sentence_mapping.keys():
        #     text_snippet = " ".join(keyword_sentence_mapping[k][:3])
        #     keyword_sentence_mapping[k] = text_snippet

        final_output = {}

        if len(keyword_sentence_mapping.keys()) == 0:
            return final_output
        else:
            try:
                generated_questions = generate_questions_mcq(keyword_sentence_mapping, self.device, self.tokenizer, self.model, self.s2v, self.normalized_levenshtein)
            except Exception:
                return final_output

        end = time.time()

        final_output["statement"] = modified_text
        final_output["questions"] = generated_questions["questions"]
        final_output["time_taken"] = end - start

        # Free cached GPU memory if running on CUDA.
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return final_output

    def predict_shortq(self, payload):
        inp = {
            "input_text": payload.get("input_text"),
            "max_questions": payload.get("max_questions", 4)
        }

        text = inp['input_text']
        sentences = tokenize_sentences(text)
        joiner = " "
        modified_text = joiner.join(sentences)

        keywords = get_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))

        keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

        # Collapse each keyword's sentence list into a single text snippet.
        for k in keyword_sentence_mapping.keys():
            text_snippet = " ".join(keyword_sentence_mapping[k][:3])
            keyword_sentence_mapping[k] = text_snippet

        final_output = {}

        if len(keyword_sentence_mapping.keys()) == 0:
            print('ZERO')
            return final_output
        else:
            generated_questions = generate_normal_questions(keyword_sentence_mapping, self.device, self.tokenizer, self.model)
            print(generated_questions)

        final_output["statement"] = modified_text
        final_output["questions"] = generated_questions["questions"]

        # Free cached GPU memory if running on CUDA.
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return final_output
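
For reference, a minimal usage sketch of the QGen class defined above. It assumes the module is importable as Questgen.main2, that the s2v_old Sense2Vec vectors and the en_core_web_sm spaCy model are present locally, and that the DevBM/t5-large-squad weights can be downloaded; the sample payload text is illustrative only.

# Minimal usage sketch (assumptions noted above; the sample text is hypothetical).
from Questgen.main2 import QGen

qgen = QGen()

payload = {
    "input_text": "The Eiffel Tower was completed in 1889 and is located in Paris, France.",
    "max_questions": 3,
}

mcq_output = qgen.predict_mcq(payload)        # {"statement": ..., "questions": [...], "time_taken": ...}
shortq_output = qgen.predict_shortq(payload)  # {"statement": ..., "questions": [...]}

for q in mcq_output.get("questions", []):
    print(q["question_statement"], "->", q["answer"], q["options"])

Both methods return an empty dict when no usable keywords are found, so callers should guard against missing keys (as the .get call above does).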