import os
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Fetch runtime dependencies at import time: the pke keyphrase-extraction
# library, the NLTK universal tagset, the small English spaCy pipeline, and
# pretrained sense2vec vectors (Reddit 2015).
os.system('pip install git+https://github.com/boudinfl/pke.git')
os.system('python -m nltk.downloader universal_tagset')
os.system('python -m spacy download en_core_web_sm')
os.system('wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz')
os.system('tar -xvf s2v_reddit_2015_md.tar.gz')

import spacy
import pke
import nltk
from nltk import FreqDist
nltk.download('brown', quiet=True, force=True)
nltk.download('stopwords', quiet=True, force=True)
nltk.download('popular', quiet=True, force=True)
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize
from sense2vec import Sense2Vec
from similarity.normalized_levenshtein import NormalizedLevenshtein
from flashtext import KeywordProcessor
from encoding import beam_search_decoding
from mcq import tokenize_sentences
from mcq import get_keywords
from mcq import get_sentences_for_keyword
from mcq import generate_questions_mcq
from mcq import generate_normal_questions




tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('Parth/result')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: disable dropout
nlp = spacy.load('en_core_web_sm')
s2v = Sense2Vec().from_disk('s2v_old')
fdist = FreqDist(brown.words())  # word frequencies from the Brown corpus
normalized_levenshtein = NormalizedLevenshtein()
def set_seed(seed):
        """Seed numpy and torch (CPU and all GPUs) for reproducible generation."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
set_seed(42)
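
# All three entry points below (predict_mcq, predict_shortq, paraphrase) read
# these module-level globals: the T5 tokenizer/model pair, the spaCy pipeline,
# the sense2vec vectors, the Brown-corpus frequency distribution, and the
# normalized Levenshtein scorer.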



def predict_mcq(payload):
        start = time.time()
        inp = {
            "input_text": payload.get("input_text"),
            "max_questions": payload.get("max_questions", 4)
        }

        text = inp['input_text']
        sentences = tokenize_sentences(text)
        joiner = " "
        modified_text = joiner.join(sentences)


        keywords = get_keywords(nlp, modified_text, inp['max_questions'], s2v, fdist, normalized_levenshtein, len(sentences))


        keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

        # Keep at most the first three matching sentences per keyword as context.
        for k in keyword_sentence_mapping.keys():
            text_snippet = " ".join(keyword_sentence_mapping[k][:3])
            keyword_sentence_mapping[k] = text_snippet


        final_output = {}

        if not keyword_sentence_mapping:
            return final_output
        else:
            try:
                generated_questions = generate_questions_mcq(keyword_sentence_mapping, device, tokenizer, model, s2v, normalized_levenshtein)
            except Exception:
                # Generation failed; return the empty result instead of crashing.
                return final_output
            end = time.time()

            final_output["statement"] = modified_text
            final_output["questions"] = generated_questions["questions"]
            final_output["time_taken"] = end-start

            if device.type == 'cuda':
                torch.cuda.empty_cache()

            return final_output
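
# Example (illustrative): predict_mcq takes a payload dict and returns a dict
# with "statement", "questions", and "time_taken" keys, or {} when no keywords
# survive filtering or generation fails. The sample text below is arbitrary.
#
#   payload = {
#       "input_text": "The Sun is the star at the center of the Solar System.",
#       "max_questions": 2,
#   }
#   mcqs = predict_mcq(payload)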



def predict_shortq(payload):
        inp = {
            "input_text": payload.get("input_text"),
            "max_questions": payload.get("max_questions", 4)
        }

        text = inp['input_text']
        sentences = tokenize_sentences(text)
        joiner = " "
        modified_text = joiner.join(sentences)


        keywords = get_keywords(nlp, modified_text, inp['max_questions'], s2v, fdist, normalized_levenshtein, len(sentences))


        keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

        # Keep at most the first three matching sentences per keyword as context.
        for k in keyword_sentence_mapping.keys():
            text_snippet = " ".join(keyword_sentence_mapping[k][:3])
            keyword_sentence_mapping[k] = text_snippet

        final_output = {}

        if not keyword_sentence_mapping:
            print('No keywords found; returning an empty result.')
            return final_output
        else:

            generated_questions = generate_normal_questions(keyword_sentence_mapping,device,tokenizer,model)
            print(generated_questions)


        final_output["statement"] = modified_text
        final_output["questions"] = generated_questions["questions"]

        if device.type == 'cuda':
            torch.cuda.empty_cache()

        return final_output
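
# Example (illustrative): predict_shortq mirrors predict_mcq but produces
# short-answer questions via generate_normal_questions. Note it has no
# try/except guard, so generation errors propagate to the caller.
#
#   short_qs = predict_shortq({"input_text": some_text, "max_questions": 3})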





def paraphrase(payload):
        start = time.time()
        inp = {
            "input_text": payload.get("input_text"),
            "max_questions": payload.get("max_questions", 3)
        }

        text = inp['input_text']
        num = inp['max_questions']

        sentence = text
        text = "paraphrase: " + sentence + " </s>"

        # pad_to_max_length is deprecated in recent transformers releases;
        # padding is a no-op for a single sequence, so "longest" is safe here.
        encoding = tokenizer.encode_plus(text, padding="longest", return_tensors="pt")
        input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

        beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length= 50,
            num_beams=50,
            num_return_sequences=num,
            no_repeat_ngram_size=2,
            early_stopping=True
            )

        final_outputs = []
        for beam_output in beam_outputs:
            sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
            if sent.lower() != sentence.lower() and sent not in final_outputs:
                final_outputs.append(sent)

        output = {}
        output['Question'] = sentence  # the original question, without the task prefix
        output['Count'] = num
        output['Paraphrased Questions'] = final_outputs

        for i, final_output in enumerate(final_outputs):
            print("{}: {}".format(i, final_output))

        if device.type == 'cuda':
            torch.cuda.empty_cache()

        return output
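

if __name__ == "__main__":
    # Minimal smoke test (illustrative): exercise all three entry points on a
    # short sample passage. The text and question counts are arbitrary choices.
    sample_text = ("The Sun is the star at the center of the Solar System. "
                   "It is a nearly perfect sphere of hot plasma, heated to "
                   "incandescence by nuclear fusion reactions in its core.")
    print(predict_mcq({"input_text": sample_text, "max_questions": 2}))
    print(predict_shortq({"input_text": sample_text, "max_questions": 2}))
    print(paraphrase({"input_text": "What is the Sun made of?", "max_questions": 2}))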