Rehman1603 committed on
Commit
2d62082
1 Parent(s): e84a10b

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +201 -0
main.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np # linear algebra
2
+ import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
3
+ import time
4
+ import torch
5
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
6
+ import random
7
+ import spacy
8
+ import zipfile
9
+ import os
10
+ import json
11
+ from sense2vec import Sense2Vec
12
+ import requests
13
+ from collections import OrderedDict
14
+ import string
15
+ import pke
16
+ import nltk
17
+ import numpy
18
+ from nltk import FreqDist
19
+ nltk.download('brown', quiet=True, force=True)
20
+ nltk.download('stopwords', quiet=True, force=True)
21
+ nltk.download('popular', quiet=True, force=True)
22
+ from nltk.corpus import stopwords
23
+ from nltk.corpus import brown
24
+ from similarity.normalized_levenshtein import NormalizedLevenshtein
25
+ from nltk.tokenize import sent_tokenize
26
+ from flashtext import KeywordProcessor
27
+ from encoding import beam_search_decoding
28
+ from mcq import tokenize_sentences
29
+ from mcq import get_keywords
30
+ from mcq import get_sentences_for_keyword
31
+ from mcq import generate_questions_mcq
32
+ from mcq import generate_normal_questions
33
+ import time
34
+
35
+
36
# One-time environment bootstrap: install pke and fetch the NLTK, spaCy and
# sense2vec resources that the module-level setup below loads.
#
# The original commands kept the IPython "!" shell escape. Passed through
# os.system, that makes the shell look for programs literally named "!pip",
# "!python", "!wget" and "!tar", so none of the steps ever ran. They are now
# executed as explicit argument lists (no shell involved) and the Python
# tools run under the interpreter that is executing this file.
#
# NOTE(review): `import pke` at the top of the file runs before this install
# step, so on a fresh environment that import fails first. Consider moving
# these steps into a requirements file / build step.
import subprocess
import sys

_S2V_ARCHIVE = 's2v_reddit_2015_md.tar.gz'
_S2V_URL = 'https://github.com/explosion/sense2vec/releases/download/v1.0.0/' + _S2V_ARCHIVE
_S2V_DIR = 's2v_old'  # directory the Sense2Vec vectors are loaded from below

_bootstrap_commands = [
    [sys.executable, '-m', 'pip', 'install', 'git+https://github.com/boudinfl/pke.git'],
    [sys.executable, '-m', 'nltk.downloader', 'universal_tagset'],
    # Request the model by the same name that spacy.load() uses below.
    [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
]

# Only fetch / unpack the sense2vec vectors when they are not already on disk,
# so that re-importing the module does not download the archive again.
if not os.path.isdir(_S2V_DIR):
    if not os.path.exists(_S2V_ARCHIVE):
        _bootstrap_commands.append(['wget', _S2V_URL])
    _bootstrap_commands.append(['tar', '-xvf', _S2V_ARCHIVE])

for _command in _bootstrap_commands:
    try:
        # check=False keeps the original best-effort behaviour: a failing
        # step does not abort the import.
        subprocess.run(_command, check=False)
    except OSError as _exc:
        # Raised when the executable itself is missing (e.g. no wget/tar).
        print("bootstrap step {} could not be started: {!r}".format(_command, _exc))
41
+
42
# Shared inference resources. They are created once at import time and used
# as module-level globals by predict_mcq, predict_shortq and paraphrase.

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer comes from 't5-large'; model weights from the 'Parth/result' checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-large')
# Module.to() returns the module itself, so `model` is the loaded model on `device`.
model = T5ForConditionalGeneration.from_pretrained('Parth/result').to(device)

# Linguistic helpers handed to the keyword-extraction / MCQ routines.
nlp = spacy.load('en_core_web_sm')
s2v = Sense2Vec().from_disk('s2v_old')
fdist = FreqDist(brown.words())  # word frequencies over the Brown corpus
normalized_levenshtein = NormalizedLevenshtein()
53
def set_seed(seed):
    """Seed the random number generators used by this module.

    Seeds NumPy's global generator, PyTorch (plus every CUDA device when
    CUDA is available) and Python's built-in ``random`` module, so repeated
    runs with the same seed draw the same random values.

    Args:
        seed: Integer seed value. It must be acceptable to
            ``numpy.random.seed`` and ``torch.manual_seed``.
    """
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # The file imports ``random`` as well; leaving it unseeded would keep any
    # code relying on it non-reproducible. Seeded last so that an invalid
    # seed is still rejected by NumPy / PyTorch exactly as before.
    random.seed(seed)


# Fix the seed once at import time.
set_seed(42)
59
+
60
+
61
+
62
def predict_mcq(payload):
    """Generate multiple-choice questions for the text in *payload*.

    Args:
        payload: Mapping with an ``"input_text"`` entry and an optional
            ``"max_questions"`` entry (defaults to 4).

    Returns:
        A dict with the keys ``"statement"`` (the tokenized sentences joined
        with single spaces), ``"questions"`` (the ``"questions"`` entry of
        the generator's result) and ``"time_taken"`` (elapsed seconds).
        An empty dict is returned when no input text is supplied, when no
        keyword maps to a sentence, or when question generation fails.
    """
    start = time.time()
    final_output = {}

    text = payload.get("input_text")
    max_questions = payload.get("max_questions", 4)

    # Missing or empty text: nothing to tokenize, so report "no questions"
    # the same way the no-keyword case below does.
    if not text:
        return final_output

    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(
        nlp, modified_text, max_questions, s2v, fdist,
        normalized_levenshtein, len(sentences),
    )
    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

    # Collapse each keyword's sentence list into one snippet built from at
    # most its first three sentences. Values are replaced in place; the set
    # of keys does not change, so updating during iteration is safe.
    for keyword, matched_sentences in keyword_sentence_mapping.items():
        keyword_sentence_mapping[keyword] = " ".join(matched_sentences[:3])

    if not keyword_sentence_mapping:
        return final_output

    try:
        generated_questions = generate_questions_mcq(
            keyword_sentence_mapping, device, tokenizer, model,
            s2v, normalized_levenshtein,
        )
        end = time.time()
    except Exception as exc:
        # Best-effort contract is kept (empty result on failure), but the
        # failure is reported instead of being swallowed by a bare except.
        print("predict_mcq: question generation failed: {!r}".format(exc))
        return final_output
    finally:
        # The original compared the torch.device *class* to 'cuda', which is
        # always False; check the device actually in use instead.
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]
    final_output["time_taken"] = end - start

    return final_output
105
+
106
+
107
+
108
def predict_shortq(payload):
    """Generate short-answer questions for the text in *payload*.

    Args:
        payload: Mapping with an ``"input_text"`` entry and an optional
            ``"max_questions"`` entry (defaults to 4).

    Returns:
        A dict with the keys ``"statement"`` (the tokenized sentences joined
        with single spaces) and ``"questions"`` (the ``"questions"`` entry of
        the generator's result). An empty dict is returned when no input
        text is supplied or when no keyword maps to a sentence.
    """
    final_output = {}

    text = payload.get("input_text")
    max_questions = payload.get("max_questions", 4)

    # Missing or empty text: nothing to tokenize, so return the same empty
    # result used for the no-keyword case below.
    if not text:
        return final_output

    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(
        nlp, modified_text, max_questions, s2v, fdist,
        normalized_levenshtein, len(sentences),
    )
    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

    # Collapse each keyword's sentence list into one snippet built from at
    # most its first three sentences. Values are replaced in place; the set
    # of keys does not change, so updating during iteration is safe.
    for keyword, matched_sentences in keyword_sentence_mapping.items():
        keyword_sentence_mapping[keyword] = " ".join(matched_sentences[:3])

    if not keyword_sentence_mapping:
        print('ZERO')
        return final_output

    try:
        generated_questions = generate_normal_questions(
            keyword_sentence_mapping, device, tokenizer, model,
        )
    finally:
        # The original compared the torch.device *class* to 'cuda', which is
        # always False; check the device actually in use instead. Errors from
        # the generator still propagate to the caller as before.
        if device.type == 'cuda':
            torch.cuda.empty_cache()
    print(generated_questions)

    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]

    return final_output
147
+
148
+
149
+
150
+
151
+
152
def paraphrase(payload):
    """Produce paraphrases of the sentence in *payload* using beam search.

    Args:
        payload: Mapping with an ``"input_text"`` entry (the sentence to
            paraphrase) and an optional ``"max_questions"`` entry giving the
            number of sequences requested from the model (defaults to 3).

    Returns:
        A dict with the keys ``"Question"`` (the prompt string fed to the
        model), ``"Count"`` (the requested number of sequences) and
        ``"Paraphrased Questions"`` (the distinct decoded outputs that differ
        from the input sentence, compared case-insensitively).
    """
    sentence = payload.get("input_text")
    num = payload.get("max_questions", 3)

    text = "paraphrase: " + sentence + " </s>"

    # NOTE(review): `pad_to_max_length` is deprecated in recent transformers
    # releases in favour of `padding=` - confirm against the pinned version.
    encoding = tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=50,
        num_beams=50,
        num_return_sequences=num,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    # Keep each distinct candidate once, skipping any that merely echo the input.
    final_outputs = []
    for beam_output in beam_outputs:
        sent = tokenizer.decode(
            beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        if sent.lower() != sentence.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    # NOTE(review): 'Question' carries the full prompt ("paraphrase: ... </s>"),
    # not the bare input sentence - kept as-is for callers; confirm intent.
    output = {
        'Question': text,
        'Count': num,
        'Paraphrased Questions': final_outputs,
    }

    # The original format string had a single placeholder, so only the index
    # was printed and the paraphrase itself was dropped.
    for i, final_output in enumerate(final_outputs):
        print("{}: {}".format(i, final_output))

    # The original compared the torch.device *class* to 'cuda', which is
    # always False; check the device actually in use instead.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return output
200
+
201
+