DevBM committed on
Commit
2cbf0f3
1 Parent(s): dda9416

Update Questgen/main2.py

Files changed (1)
  1. Questgen/main2.py +482 -531
Questgen/main2.py CHANGED
@@ -1,531 +1,482 @@
- import numpy as np # linear algebra
- import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
- import time
- import torch
- from transformers import T5ForConditionalGeneration,T5Tokenizer
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- from transformers import pipeline
- import random
- import spacy
- import zipfile
- import os
- import json
- from sense2vec import Sense2Vec
- import requests
- from collections import OrderedDict
- import string
- import pke
- import nltk
- import numpy
- import yake
- from nltk import FreqDist
- nltk.download('brown', quiet=True, force=True)
- nltk.download('stopwords', quiet=True, force=True)
- nltk.download('popular', quiet=True, force=True)
- from nltk.corpus import stopwords
- from nltk.corpus import brown
- from similarity.normalized_levenshtein import NormalizedLevenshtein
- from nltk.tokenize import sent_tokenize
- from flashtext import KeywordProcessor
- # from Questgen.encoding.encoding import beam_search_decoding
- # from Questgen.mcq.mcq import tokenize_sentences
- # from Questgen.mcq.mcq import get_keywords
- # from Questgen.mcq.mcq import get_sentences_for_keyword
- # from Questgen.mcq.mcq import generate_questions_mcq
- # from Questgen.mcq.mcq import generate_normal_questions
- import time
- import numpy as np # linear algebra
- import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
- import time
- import torch
- from transformers import T5ForConditionalGeneration,T5Tokenizer
- import random
- import spacy
- import zipfile
- import os
- import json
- from sense2vec import Sense2Vec
- import requests
- from collections import OrderedDict
- import string
- import pke
- import nltk
- from nltk import FreqDist
- nltk.download('brown')
- nltk.download('stopwords')
- nltk.download('popular')
- from nltk.corpus import stopwords
- from nltk.corpus import brown
- # from similarity.normalized_levenshtein import NormalizedLevenshtein
- from nltk.tokenize import sent_tokenize
- # from flashtext import KeywordProcessor
-
- def beam_search_decoding (inp_ids,attn_mask,model,tokenizer):
-     beam_output = model.generate(input_ids=inp_ids,
-                                  attention_mask=attn_mask,
-                                  max_length=256,
-                                  num_beams=10,
-                                  num_return_sequences=3,
-                                  no_repeat_ngram_size=2,
-                                  early_stopping=True
-                                  )
-     Questions = [tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True) for out in
-                  beam_output]
-     return [Question.strip().capitalize() for Question in Questions]
-
-
-
- def MCQs_available(word,s2v):
-     word = word.replace(" ", "_")
-     sense = s2v.get_best_sense(word)
-     if sense is not None:
-         return True
-     else:
-         return False
-
-
- def edits(word):
-     "All edits that are one edit away from `word`."
-     letters = 'abcdefghijklmnopqrstuvwxyz '+string.punctuation
-     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
-     deletes = [L + R[1:] for L, R in splits if R]
-     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
-     replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
-     inserts = [L + c + R for L, R in splits for c in letters]
-     return set(deletes + transposes + replaces + inserts)
-
-
- def sense2vec_get_words(word,s2v):
-     output = []
-
-     word_preprocessed = word.translate(word.maketrans("","", string.punctuation))
-     word_preprocessed = word_preprocessed.lower()
-
-     word_edits = edits(word_preprocessed)
-
-     word = word.replace(" ", "_")
-
-     sense = s2v.get_best_sense(word)
-     most_similar = s2v.most_similar(sense, n=15)
-
-     compare_list = [word_preprocessed]
-     for each_word in most_similar:
-         append_word = each_word[0].split("|")[0].replace("_", " ")
-         append_word = append_word.strip()
-         append_word_processed = append_word.lower()
-         append_word_processed = append_word_processed.translate(append_word_processed.maketrans("","", string.punctuation))
-         if append_word_processed not in compare_list and word_preprocessed not in append_word_processed and append_word_processed not in word_edits:
-             output.append(append_word.title())
-             compare_list.append(append_word_processed)
-
-
-     out = list(OrderedDict.fromkeys(output))
-
-     return out
-
- def get_options(answer,s2v):
-     distractors =[]
-
-     try:
-         distractors = sense2vec_get_words(answer,s2v)
-         if len(distractors) > 0:
-             print(" Sense2vec_distractors successful for word : ", answer)
-             return distractors,"sense2vec"
-     except:
-         print (" Sense2vec_distractors failed for word : ",answer)
-
-
-     return distractors,"None"
-
- def tokenize_sentences(text):
-     sentences = [sent_tokenize(text)]
-     sentences = [y for x in sentences for y in x]
-     # Remove any short sentences less than 20 letters.
-     sentences = [sentence.strip() for sentence in sentences if len(sentence) > 5]
-     return sentences
-
-
- def get_sentences_for_keyword(keywords, sentences):
-     keyword_processor = KeywordProcessor()
-     keyword_sentences = {}
-     for word in keywords:
-         word = word.strip()
-         keyword_sentences[word] = []
-         keyword_processor.add_keyword(word)
-     for sentence in sentences:
-         keywords_found = keyword_processor.extract_keywords(sentence)
-         for key in keywords_found:
-             keyword_sentences[key].append(sentence)
-
-     for key in keyword_sentences.keys():
-         values = keyword_sentences[key]
-         values = sorted(values, key=len, reverse=True)
-         keyword_sentences[key] = values
-
-     delete_keys = []
-     for k in keyword_sentences.keys():
-         if len(keyword_sentences[k]) == 0:
-             delete_keys.append(k)
-     for del_key in delete_keys:
-         del keyword_sentences[del_key]
-     print(keyword_sentences)
-     return keyword_sentences
-
-
- def is_far(words_list,currentword,thresh,normalized_levenshtein):
-     threshold = thresh
-     score_list =[]
-     for word in words_list:
-         score_list.append(normalized_levenshtein.distance(word.lower(),currentword.lower()))
-     if min(score_list)>=threshold:
-         return True
-     else:
-         return False
-
- def filter_phrases(phrase_keys,max,normalized_levenshtein ):
-     filtered_phrases =[]
-     if len(phrase_keys)>0:
-         filtered_phrases.append(phrase_keys[0])
-         for ph in phrase_keys[1:]:
-             if is_far(filtered_phrases,ph,0.7,normalized_levenshtein ):
-                 filtered_phrases.append(ph)
-             if len(filtered_phrases)>=max:
-                 break
-     return filtered_phrases
-
-
- def get_nouns_multipartite(text):
-     # out = []
-
-     # extractor = pke.unsupervised.MultipartiteRank()
-     # extractor.load_document(input=text, language='en')
-     # pos = {'PROPN', 'NOUN'}
-     # stoplist = list(string.punctuation)
-     # stoplist += stopwords.words('english')
-     # extractor.candidate_selection(pos=pos)
-     # # 4. build the Multipartite graph and rank candidates using random walk,
-     # # alpha controls the weight adjustment mechanism, see TopicRank for
-     # # threshold/method parameters.
-     # try:
-     #     extractor.candidate_weighting(alpha=1.1,
-     #                                   threshold=0.75,
-     #                                   method='average')
-     # except:
-     #     return out
-
-     # keyphrases = extractor.get_n_best(n=10)
-
-     # for key in keyphrases:
-     #     out.append(key[0])
-
-     # nlp = spacy.load("en_core_web_sm")
-     # labels = nlp(text)
-
-     # for i in (labels.ents):
-     #     out.append(str(i))
-     nlp = spacy.load('en_core_web_sm')
-     doc = nlp(text)
-     ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
-     # Extract named entities using spaCy
-     spacy_entities = [ent.text for ent in doc.ents]
-     print(f"\n\nSpacy Entities: {spacy_entities}\n\n")
-     # Extract named entities using BERT-based NER
-     bert_entities = [entity['word'] for entity in ner_pipeline(text)]
-     print(f"BERT Entities: {bert_entities}\n\n")
-     # Combine both NER results and remove duplicates
-     entities = list(set(spacy_entities))
-
-     # Extract nouns and verbs using spaCy
-     nouns = [chunk.text for chunk in doc.noun_chunks]
-     verbs = [token.lemma_ for token in doc if token.pos_ == 'VERB']
-     print(f"Spacy Nouns: {nouns}\n\n")
-     print(f"Spacy Verbs: {verbs}\n\n")
-
-     # Use YAKE for keyphrase extraction
-     yake_extractor = yake.KeywordExtractor()
-     yake_keywords = yake_extractor.extract_keywords(text)
-     yake_keywords = [kw[0] for kw in yake_keywords]
-     print(f"Yake: {yake_keywords}\n\n")
-     # Combine all keywords and remove duplicates
-     combined_keywords = list(set(entities + nouns + verbs + yake_keywords))
-     vectorizer = TfidfVectorizer()
-     tfidf_matrix = vectorizer.fit_transform(combined_keywords)
-     similarity_matrix = cosine_similarity(tfidf_matrix)
-     clusters = []
-
-     similarity_threshold = 0.45
-
-     for idx, word in enumerate(combined_keywords):
-         added_to_cluster = False
-         for cluster in clusters:
-             # Check if the word is similar to any word in the existing cluster
-             if any(similarity_matrix[idx, other_idx] > similarity_threshold for other_idx in cluster):
-                 cluster.append(idx)
-                 added_to_cluster = True
-                 break
-         if not added_to_cluster:
-             clusters.append([idx])
-
-     # Step 4: Select representative words from each cluster
-     representative_words = [combined_keywords[cluster[0]] for cluster in clusters]
-
-     # Print the representative words
-     print("Keywords after removing similar words: ", representative_words)
-     # return combined_keywords
-
-     return representative_words
-
-
- def get_phrases(doc):
-     phrases={}
-     for np in doc.noun_chunks:
-         phrase =np.text
-         len_phrase = len(phrase.split())
-         if len_phrase > 1:
-             if phrase not in phrases:
-                 phrases[phrase]=1
-             else:
-                 phrases[phrase]=phrases[phrase]+1
-
-     phrase_keys=list(phrases.keys())
-     phrase_keys = sorted(phrase_keys, key= lambda x: len(x),reverse=True)
-     phrase_keys=phrase_keys[:50]
-     return phrase_keys
-
-
-
- def get_keywords(nlp,text,max_keywords,s2v,fdist,normalized_levenshtein,no_of_sentences):
-     doc = nlp(text)
-     max_keywords = int(max_keywords)
-
-     keywords = get_nouns_multipartite(text)
-     # keywords = sorted(keywords, key=lambda x: fdist[x])
-     # keywords = filter_phrases(keywords, max_keywords,normalized_levenshtein )
-
-     # phrase_keys = get_phrases(doc)
-     # filtered_phrases = filter_phrases(phrase_keys, max_keywords,normalized_levenshtein )
-
-     # total_phrases = keywords + filtered_phrases
-
-     # total_phrases_filtered = filter_phrases(total_phrases, min(max_keywords, 2*no_of_sentences),normalized_levenshtein )
-     total_phrases_filtered = keywords
-
-
-     answers = []
-     for answer in total_phrases_filtered:
-         if answer not in answers and MCQs_available(answer,s2v):
-             answers.append(answer)
-
-     # answers = answers[:max_keywords]
-     # answers = keywords
-     return answers
-
- def generate_questions_mcq(keyword_sent_mapping, device, tokenizer, model, sense2vec, normalized_levenshtein):
-     batch_text = []
-     answers = list(keyword_sent_mapping.keys()) # Get all answers from the keys
-
-     for answer in answers:
-         value_list = keyword_sent_mapping[answer] # Get list of sentences for this answer
-         for txt in value_list:
-             text = "<context>\t" + txt + "\t<answer>\t" + answer
-             batch_text.append(text)
-
-     encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")
-
-     print("Running model for generation")
-     input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
-
-     with torch.no_grad():
-         outs = model.generate(input_ids=input_ids,
-                               attention_mask=attention_masks,
-                               max_length=150)
-
-     output_array = {"questions": []}
-
-     for index, val in enumerate(answers):
-         out = outs[index, :]
-         dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-
-         Question = dec.replace("question:", "")
-         Question = Question.strip()
-
-         individual_question = {
-             "question_statement": Question,
-             "question_type": "MCQ",
-             "answer": val,
-             "id": index + 1,
-             "options": [],
-             "options_algorithm": [],
-             "extra_options": [],
-             "context": keyword_sent_mapping[val] # Assuming keyword_sent_mapping is a dictionary of lists
-         }
-
-         # Get options and filter them
-         individual_question["options"], individual_question["options_algorithm"] = get_options(val, sense2vec)
-         individual_question["options"] = filter_phrases(individual_question["options"], 10, normalized_levenshtein)
-
-         # Adjusting the number of options and extra options
-         index = 3
-         individual_question["extra_options"] = individual_question["options"][index:]
-         individual_question["options"] = individual_question["options"][:index]
-
-         if len(individual_question["options"]) > 0:
-             output_array["questions"].append(individual_question)
-
-     return output_array
-
-
-
- def generate_normal_questions(keyword_sent_mapping,device,tokenizer,model): #for normal one word questions
-     batch_text = []
-     answers = keyword_sent_mapping.keys()
-     for answer in answers:
-         txt = keyword_sent_mapping[answer]
-         context = "context: " + txt
-         text = context + " " + "answer: " + answer + " </s>"
-         batch_text.append(text)
-
-     encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")
-
-
-     print ("Running model for generation")
-     input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
-
-     with torch.no_grad():
-         outs = model.generate(input_ids=input_ids,
-                               attention_mask=attention_masks,
-                               max_length=150)
-
-     output_array ={}
-     output_array["questions"] =[]
-
-     for index, val in enumerate(answers):
-         individual_quest= {}
-         out = outs[index, :]
-         dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-
-         Question= dec.replace('question:', '')
-         Question= Question.strip()
-
-         individual_quest['Question']= Question
-         individual_quest['Answer']= val
-         individual_quest["id"] = index+1
-         individual_quest["context"] = keyword_sent_mapping[val]
-
-         output_array["questions"].append(individual_quest)
-
-     return output_array
-
- def random_choice():
-     a = random.choice([0,1])
-     return bool(a)
-
- class QGen:
-
-     def __init__(self):
-
-         self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
-         model = T5ForConditionalGeneration.from_pretrained('DevBM/t5-large-squad')
-         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         model.to(device)
-         # model.eval()
-         self.device = device
-         self.model = model
-         self.nlp = spacy.load('en_core_web_sm')
-
-         self.s2v = Sense2Vec().from_disk('s2v_old')
-
-         self.fdist = FreqDist(brown.words())
-         self.normalized_levenshtein = NormalizedLevenshtein()
-         self.set_seed(42)
-
-     def set_seed(self,seed):
-         numpy.random.seed(seed)
-         torch.manual_seed(seed)
-         if torch.cuda.is_available():
-             torch.cuda.manual_seed_all(seed)
-
-     def predict_mcq(self, payload):
-         start = time.time()
-         inp = {
-             "input_text": payload.get("input_text"),
-             "max_questions": payload.get("max_questions", 4)
-         }
-
-         text = inp['input_text']
-         sentences = tokenize_sentences(text)
-         joiner = " "
-         modified_text = joiner.join(sentences)
-
-
-         keywords = get_keywords(self.nlp,modified_text,inp['max_questions'],self.s2v,self.fdist,self.normalized_levenshtein,len(sentences) )
-
-
-         keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
-
-         # for k in keyword_sentence_mapping.keys():
-         #     text_snippet = " ".join(keyword_sentence_mapping[k][:3])
-         #     keyword_sentence_mapping[k] = text_snippet
-
-
-         final_output = {}
-
-         if len(keyword_sentence_mapping.keys()) == 0:
-             return final_output
-         else:
-             try:
-                 generated_questions = generate_questions_mcq(keyword_sentence_mapping,self.device,self.tokenizer,self.model,self.s2v,self.normalized_levenshtein)
-
-             except:
-                 return final_output
-             end = time.time()
-
-             final_output["statement"] = modified_text
-             final_output["questions"] = generated_questions["questions"]
-             final_output["time_taken"] = end-start
-
-             if torch.device=='cuda':
-                 torch.cuda.empty_cache()
-
-             return final_output
-
-     def predict_shortq(self, payload):
-         inp = {
-             "input_text": payload.get("input_text"),
-             "max_questions": payload.get("max_questions", 4)
-         }
-
-         text = inp['input_text']
-         sentences = tokenize_sentences(text)
-         joiner = " "
-         modified_text = joiner.join(sentences)
-
-
-         keywords = get_keywords(self.nlp,modified_text,inp['max_questions'],self.s2v,self.fdist,self.normalized_levenshtein,len(sentences) )
-
-
-         keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
-
-         for k in keyword_sentence_mapping.keys():
-             text_snippet = " ".join(keyword_sentence_mapping[k][:3])
-             keyword_sentence_mapping[k] = text_snippet
-
-         final_output = {}
-
-         if len(keyword_sentence_mapping.keys()) == 0:
-             print('ZERO')
-             return final_output
-         else:
-
-             generated_questions = generate_normal_questions(keyword_sentence_mapping,self.device,self.tokenizer,self.model)
-             print(generated_questions)
-
-
-             final_output["statement"] = modified_text
-             final_output["questions"] = generated_questions["questions"]
-
-             if torch.device=='cuda':
-                 torch.cuda.empty_cache()
-
-             return final_output
 
+ import numpy as np # linear algebra
+ import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+ import time
+ import torch
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from transformers import pipeline
+ import random
+ import spacy
+ import zipfile
+ import os
+ import json
+ from sense2vec import Sense2Vec
+ import requests
+ from collections import OrderedDict
+ import string
+ import pke
+ import nltk
+ import numpy
+ import yake
+ from nltk import FreqDist
+ nltk.download('brown', quiet=True, force=True)
+ nltk.download('stopwords', quiet=True, force=True)
+ nltk.download('popular', quiet=True, force=True)
+ from nltk.corpus import stopwords
+ from nltk.corpus import brown
+ from similarity.normalized_levenshtein import NormalizedLevenshtein
+ from nltk.tokenize import sent_tokenize
+ from flashtext import KeywordProcessor
+ import time
+ import numpy as np # linear algebra
+ import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+ import time
+ import torch
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
+ import random
+ import spacy
+ import zipfile
+ import os
+ import json
+ from sense2vec import Sense2Vec
+ import requests
+ from collections import OrderedDict
+ import string
+ import pke
+ import nltk
+ from nltk import FreqDist
+ nltk.download('brown')
+ nltk.download('stopwords')
+ nltk.download('popular')
+ from nltk.corpus import stopwords
+ from nltk.corpus import brown
+ # from similarity.normalized_levenshtein import NormalizedLevenshtein
+ from nltk.tokenize import sent_tokenize
+ # from flashtext import KeywordProcessor
+
+ def beam_search_decoding (inp_ids,attn_mask,model,tokenizer):
+     beam_output = model.generate(input_ids=inp_ids,
+                                  attention_mask=attn_mask,
+                                  max_length=256,
+                                  num_beams=10,
+                                  num_return_sequences=3,
+                                  no_repeat_ngram_size=2,
+                                  early_stopping=True
+                                  )
+     Questions = [tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True) for out in
+                  beam_output]
+     return [Question.strip().capitalize() for Question in Questions]
+
+
+
+ def MCQs_available(word,s2v):
+     word = word.replace(" ", "_")
+     sense = s2v.get_best_sense(word)
+     if sense is not None:
+         return True
+     else:
+         return False
+
+
+ def edits(word):
+     "All edits that are one edit away from `word`."
+     letters = 'abcdefghijklmnopqrstuvwxyz '+string.punctuation
+     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+     deletes = [L + R[1:] for L, R in splits if R]
+     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
+     replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+     inserts = [L + c + R for L, R in splits for c in letters]
+     return set(deletes + transposes + replaces + inserts)
+
+
+ def sense2vec_get_words(word,s2v):
+     output = []
+
+     word_preprocessed = word.translate(word.maketrans("","", string.punctuation))
+     word_preprocessed = word_preprocessed.lower()
+
+     word_edits = edits(word_preprocessed)
+
+     word = word.replace(" ", "_")
+
+     sense = s2v.get_best_sense(word)
+     most_similar = s2v.most_similar(sense, n=15)
+
+     compare_list = [word_preprocessed]
+     for each_word in most_similar:
+         append_word = each_word[0].split("|")[0].replace("_", " ")
+         append_word = append_word.strip()
+         append_word_processed = append_word.lower()
+         append_word_processed = append_word_processed.translate(append_word_processed.maketrans("","", string.punctuation))
+         if append_word_processed not in compare_list and word_preprocessed not in append_word_processed and append_word_processed not in word_edits:
+             output.append(append_word.title())
+             compare_list.append(append_word_processed)
+
+
+     out = list(OrderedDict.fromkeys(output))
+
+     return out
+
+ def get_options(answer,s2v):
+     distractors =[]
+
+     try:
+         distractors = sense2vec_get_words(answer,s2v)
+         if len(distractors) > 0:
+             print(" Sense2vec_distractors successful for word : ", answer)
+             return distractors,"sense2vec"
+     except:
+         print (" Sense2vec_distractors failed for word : ",answer)
+
+
+     return distractors,"None"
+
+ def tokenize_sentences(text):
+     sentences = [sent_tokenize(text)]
+     sentences = [y for x in sentences for y in x]
+     # Remove any short sentences less than 20 letters.
+     sentences = [sentence.strip() for sentence in sentences if len(sentence) > 5]
+     return sentences
+
+
+ def get_sentences_for_keyword(keywords, sentences):
+     keyword_processor = KeywordProcessor()
+     keyword_sentences = {}
+     for word in keywords:
+         word = word.strip()
+         keyword_sentences[word] = []
+         keyword_processor.add_keyword(word)
+     for sentence in sentences:
+         keywords_found = keyword_processor.extract_keywords(sentence)
+         for key in keywords_found:
+             keyword_sentences[key].append(sentence)
+
+     for key in keyword_sentences.keys():
+         values = keyword_sentences[key]
+         values = sorted(values, key=len, reverse=True)
+         keyword_sentences[key] = values
+
+     delete_keys = []
+     for k in keyword_sentences.keys():
+         if len(keyword_sentences[k]) == 0:
+             delete_keys.append(k)
+     for del_key in delete_keys:
+         del keyword_sentences[del_key]
+     print(keyword_sentences)
+     return keyword_sentences
+
+
+ def is_far(words_list,currentword,thresh,normalized_levenshtein):
+     threshold = thresh
+     score_list =[]
+     for word in words_list:
+         score_list.append(normalized_levenshtein.distance(word.lower(),currentword.lower()))
+     if min(score_list)>=threshold:
+         return True
+     else:
+         return False
+
+ def filter_phrases(phrase_keys,max,normalized_levenshtein ):
+     filtered_phrases =[]
+     if len(phrase_keys)>0:
+         filtered_phrases.append(phrase_keys[0])
+         for ph in phrase_keys[1:]:
+             if is_far(filtered_phrases,ph,0.7,normalized_levenshtein ):
+                 filtered_phrases.append(ph)
+             if len(filtered_phrases)>=max:
+                 break
+     return filtered_phrases
+
+
+ def get_nouns_multipartite(text):
+     # out = []
+
+     # extractor = pke.unsupervised.MultipartiteRank()
+     # extractor.load_document(input=text, language='en')
+     # pos = {'PROPN', 'NOUN'}
+     # stoplist = list(string.punctuation)
+     # stoplist += stopwords.words('english')
+     # extractor.candidate_selection(pos=pos)
+     # # 4. build the Multipartite graph and rank candidates using random walk,
+     # # alpha controls the weight adjustment mechanism, see TopicRank for
+     # # threshold/method parameters.
+     # try:
+     #     extractor.candidate_weighting(alpha=1.1,
+     #                                   threshold=0.75,
+     #                                   method='average')
+     # except:
+     #     return out
+
+     # keyphrases = extractor.get_n_best(n=10)
+
+     # for key in keyphrases:
+     #     out.append(key[0])
+
+     # nlp = spacy.load("en_core_web_sm")
+     # labels = nlp(text)
+
+     # for i in (labels.ents):
+     #     out.append(str(i))
+     nlp = spacy.load('en_core_web_sm')
+     doc = nlp(text)
+     # Extract named entities using spaCy
+     spacy_entities = [ent.text for ent in doc.ents]
+     print(f"\n\nSpacy Entities: {spacy_entities}\n\n")
+     # Combine both NER results and remove duplicates
+     entities = list(set(spacy_entities))
+
+     # Extract nouns and verbs using spaCy
+     nouns = [chunk.text for chunk in doc.noun_chunks]
+     verbs = [token.lemma_ for token in doc if token.pos_ == 'VERB']
+     print(f"Spacy Nouns: {nouns}\n\n")
+     print(f"Spacy Verbs: {verbs}\n\n")
+
+     # Use YAKE for keyphrase extraction
+     yake_extractor = yake.KeywordExtractor()
+     yake_keywords = yake_extractor.extract_keywords(text)
+     yake_keywords = [kw[0] for kw in yake_keywords]
+     print(f"Yake: {yake_keywords}\n\n")
+     # Combine all keywords and remove duplicates
+     combined_keywords = list(set(entities + nouns + verbs + yake_keywords))
+     vectorizer = TfidfVectorizer()
+     tfidf_matrix = vectorizer.fit_transform(combined_keywords)
+     similarity_matrix = cosine_similarity(tfidf_matrix)
+     clusters = []
+
+     similarity_threshold = 0.45
+
+     for idx, word in enumerate(combined_keywords):
+         added_to_cluster = False
+         for cluster in clusters:
+             # Check if the word is similar to any word in the existing cluster
+             if any(similarity_matrix[idx, other_idx] > similarity_threshold for other_idx in cluster):
+                 cluster.append(idx)
+                 added_to_cluster = True
+                 break
+         if not added_to_cluster:
+             clusters.append([idx])
+
+     # Step 4: Select representative words from each cluster
+     representative_words = [combined_keywords[cluster[0]] for cluster in clusters]
+
+     # Print the representative words
+     print("Keywords after removing similar words: ", representative_words)
+     # return combined_keywords
+
+     return representative_words
+
+
+ def get_phrases(doc):
+     phrases={}
+     for np in doc.noun_chunks:
+         phrase =np.text
+         len_phrase = len(phrase.split())
+         if len_phrase > 1:
+             if phrase not in phrases:
+                 phrases[phrase]=1
+             else:
+                 phrases[phrase]=phrases[phrase]+1
+
+     phrase_keys=list(phrases.keys())
+     phrase_keys = sorted(phrase_keys, key= lambda x: len(x),reverse=True)
+     phrase_keys=phrase_keys[:50]
+     return phrase_keys
+
+
+
+ def get_keywords(nlp,text,max_keywords,s2v,fdist,normalized_levenshtein,no_of_sentences):
+     doc = nlp(text)
+     max_keywords = int(max_keywords)
+
+     keywords = get_nouns_multipartite(text)
+     # keywords = sorted(keywords, key=lambda x: fdist[x])
+     # keywords = filter_phrases(keywords, max_keywords,normalized_levenshtein )
+
+     # phrase_keys = get_phrases(doc)
+     # filtered_phrases = filter_phrases(phrase_keys, max_keywords,normalized_levenshtein )
+
+     # total_phrases = keywords + filtered_phrases
+
+     # total_phrases_filtered = filter_phrases(total_phrases, min(max_keywords, 2*no_of_sentences),normalized_levenshtein )
+     total_phrases_filtered = keywords
+
+
+     answers = []
+     for answer in total_phrases_filtered:
+         if answer not in answers and MCQs_available(answer,s2v):
+             answers.append(answer)
+
+     # answers = answers[:max_keywords]
+     # answers = keywords
+     return answers
+
+ def generate_questions_mcq(keyword_sent_mapping, device, tokenizer, model, sense2vec, normalized_levenshtein):
+     batch_text = []
+     answers = list(keyword_sent_mapping.keys()) # Get all answers from the keys
+
+     for answer in answers:
+         value_list = keyword_sent_mapping[answer] # Get list of sentences for this answer
+         for txt in value_list:
+             text = "<context>\t" + txt + "\t<answer>\t" + answer
+             batch_text.append(text)
+
+     encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")
+
+     print("Running model for generation")
+     input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+     with torch.no_grad():
+         outs = model.generate(input_ids=input_ids,
+                               attention_mask=attention_masks,
+                               max_length=150)
+
+     output_array = {"questions": []}
+
+     for index, val in enumerate(answers):
+         out = outs[index, :]
+         dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+         Question = dec.replace("question:", "")
+         Question = Question.strip()
+
+         individual_question = {
+             "question_statement": Question,
+             "question_type": "MCQ",
+             "answer": val,
+             "id": index + 1,
+             "options": [],
+             "options_algorithm": [],
+             "extra_options": [],
+             "context": keyword_sent_mapping[val] # Assuming keyword_sent_mapping is a dictionary of lists
+         }
+
+         # Get options and filter them
+         individual_question["options"], individual_question["options_algorithm"] = get_options(val, sense2vec)
+         individual_question["options"] = filter_phrases(individual_question["options"], 10, normalized_levenshtein)
+
+         # Adjusting the number of options and extra options
+         index = 3
+         individual_question["extra_options"] = individual_question["options"][index:]
+         individual_question["options"] = individual_question["options"][:index]
+
+         if len(individual_question["options"]) > 0:
+             output_array["questions"].append(individual_question)
+
+     return output_array
+
+
+
+ def generate_normal_questions(keyword_sent_mapping,device,tokenizer,model): #for normal one word questions
+     batch_text = []
+     answers = keyword_sent_mapping.keys()
+     for answer in answers:
+         txt = keyword_sent_mapping[answer]
+         context = "context: " + txt
+         text = context + " " + "answer: " + answer + " </s>"
+         batch_text.append(text)
+
+     encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")
+
+
+     print ("Running model for generation")
+     input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+     with torch.no_grad():
+         outs = model.generate(input_ids=input_ids,
+                               attention_mask=attention_masks,
+                               max_length=150)
+
+     output_array ={}
+     output_array["questions"] =[]
+
+     for index, val in enumerate(answers):
+         individual_quest= {}
+         out = outs[index, :]
+         dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+         Question= dec.replace('question:', '')
+         Question= Question.strip()
+
+         individual_quest['Question']= Question
+         individual_quest['Answer']= val
+         individual_quest["id"] = index+1
+         individual_quest["context"] = keyword_sent_mapping[val]
+
+         output_array["questions"].append(individual_quest)
+
+     return output_array
+
+ def random_choice():
+     a = random.choice([0,1])
+     return bool(a)
+
+ class QGen:
+
+     def __init__(self):
+
+         self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
+         model = T5ForConditionalGeneration.from_pretrained('DevBM/t5-large-squad')
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model.to(device)
+         # model.eval()
+         self.device = device
+         self.model = model
+         self.nlp = spacy.load('en_core_web_sm')
+
+         self.s2v = Sense2Vec().from_disk('s2v_old')
+
+         self.fdist = FreqDist(brown.words())
+         self.normalized_levenshtein = NormalizedLevenshtein()
+         self.set_seed(42)
+
+     def set_seed(self,seed):
+         numpy.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+
+     def predict_mcq(self, payload):
+         start = time.time()
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 4)
+         }
+
+         text = inp['input_text']
+         sentences = tokenize_sentences(text)
+         joiner = " "
+         modified_text = joiner.join(sentences)
+
+
+         keywords = get_keywords(self.nlp,modified_text,inp['max_questions'],self.s2v,self.fdist,self.normalized_levenshtein,len(sentences) )
+
+
+         keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
+
+         # for k in keyword_sentence_mapping.keys():
+         #     text_snippet = " ".join(keyword_sentence_mapping[k][:3])
+         #     keyword_sentence_mapping[k] = text_snippet
+
+
+         final_output = {}
+
+         if len(keyword_sentence_mapping.keys()) == 0:
+             return final_output
+         else:
+             try:
+                 generated_questions = generate_questions_mcq(keyword_sentence_mapping,self.device,self.tokenizer,self.model,self.s2v,self.normalized_levenshtein)
+
+             except:
+                 return final_output
+             end = time.time()
+
+             final_output["statement"] = modified_text
+             final_output["questions"] = generated_questions["questions"]
+             final_output["time_taken"] = end-start
+
+             if torch.device=='cuda':
+                 torch.cuda.empty_cache()
+
+             return final_output
+
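For quick reference, here is a minimal usage sketch of the QGen class this file defines. It is not part of the commit; it assumes the 's2v_old' sense2vec vectors have been extracted into the working directory and that the 'DevBM/t5-large-squad' weights are reachable, and it exercises only predict_mcq, since this revision drops the predict_shortq method.

# Hypothetical usage sketch (not part of the commit).
# Assumes 's2v_old' sense2vec vectors are present in the working directory.
from Questgen.main2 import QGen

qgen = QGen()
payload = {
    "input_text": "The Nile is the longest river in Africa. It flows north and empties into the Mediterranean Sea.",
    "max_questions": 3,
}
result = qgen.predict_mcq(payload)
# Each generated item carries the question text, the answer keyword, and its distractor options.
for q in result.get("questions", []):
    print(q["question_statement"], "->", q["answer"], q["options"])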