import random
import string

import nltk
import pke
import requests
import torch
from flashtext import KeywordProcessor
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

# sent_tokenize and stopwords require one-time NLTK data downloads:
# nltk.download('punkt'); nltk.download('stopwords')

def postprocesstext(content):
    # Capitalize the first letter of every sentence in the generated text.
    return " ".join(sent.capitalize() for sent in sent_tokenize(content))

def summarizer(text, model, tokenizer):
    # Summarize `text` with a T5-style seq2seq model; "summarize: " is the
    # task prefix T5 expects.
    text = "summarize: " + text.strip().replace("\n", " ")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)  # make sure model and inputs share a device
    encoding = tokenizer(text, max_length=512, truncation=True,
                         return_tensors="pt").to(device)
    outs = model.generate(input_ids=encoding["input_ids"],
                          attention_mask=encoding["attention_mask"],
                          early_stopping=True,
                          num_beams=3,
                          num_return_sequences=1,
                          no_repeat_ngram_size=2,
                          min_length=75,
                          max_length=300)
    summary = tokenizer.decode(outs[0], skip_special_tokens=True)
    return postprocesstext(summary).strip()
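
# Example usage (a sketch -- the checkpoint name is an assumption; any
# T5-style summarization model works with the "summarize: " prefix above):
#
#   from transformers import T5ForConditionalGeneration, T5Tokenizer
#   t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
#   t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
#   print(summarizer(load_raw_text(), t5_model, t5_tokenizer))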

def get_nouns_multipartite(content):
    # Extract candidate keyphrases with MultipartiteRank, restricted to
    # proper nouns and nouns.
    out = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content)
        # Select the longest sequences of (proper) nouns that do not contain
        # punctuation marks or stopwords as candidates.
        pos = {'PROPN', 'NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        # Older pke versions accepted the stoplist here:
        # extractor.candidate_selection(pos=pos, stoplist=stoplist)
        extractor.candidate_selection(pos=pos)
        # Build the Multipartite graph and rank candidates with a random walk;
        # alpha controls the weight-adjustment mechanism, and threshold/method
        # are the TopicRank clustering parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)
        out = [phrase for phrase, score in keyphrases]
    except Exception as e:
        out = []
        print("EXCEPTION: {}".format(e))
    return out

def filter_overlap_words(l):
    # Drop any word that appears as a substring of another word in the list,
    # and deduplicate the survivors while preserving order.
    nl = []
    for i, word in enumerate(l):
        others = l[:i] + l[i + 1:]
        if not any(word in other for other in others) and word not in nl:
            nl.append(word)
    return nl
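
# Example (sketch): filter_overlap_words(["New York", "York", "Zoo"]) returns
# ["New York", "Zoo"], since "York" is contained in "New York".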

def get_keywords(originaltext, summarytext):
    # Rank keyphrases on the full text, then keep only those that also appear
    # in the summary (flashtext does fast exact-phrase matching).
    keywords = get_nouns_multipartite(originaltext)
    keyword_processor = KeywordProcessor()
    for keyword in keywords:
        keyword_processor.add_keyword(keyword)
    keywords_found = set(keyword_processor.extract_keywords(summarytext))
    important_keywords = [kw for kw in keywords if kw in keywords_found]
    # Optionally drop keywords contained in longer ones:
    # important_keywords = filter_overlap_words(important_keywords)
    return [str(kw).title() for kw in important_keywords][:5]

def get_question(context, answer, model, tokenizer):
    # Generate a question with a T5-style model fine-tuned on the
    # "context: ... answer: ..." input format.
    text = "context: {} answer: {}".format(context, answer)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    encoding = tokenizer(text, max_length=384, truncation=True,
                         return_tensors="pt").to(device)
    outs = model.generate(input_ids=encoding["input_ids"],
                          attention_mask=encoding["attention_mask"],
                          early_stopping=True,
                          num_beams=5,
                          num_return_sequences=1,
                          no_repeat_ngram_size=2,
                          max_length=72)
    question = tokenizer.decode(outs[0], skip_special_tokens=True)
    return question.replace("question:", "").strip()

def get_related_word(word):
    # Query the Datamuse API for up to four words with a related meaning
    # ("ml" = "means like"); these serve as distractor options.
    url = "https://api.datamuse.com/words"
    response = requests.get(url, params={"ml": word}, timeout=10)
    response.raise_for_status()
    related_words = []
    for res in response.json():
        if len(related_words) >= 4:
            break
        if res["word"] != word and res["word"] != "":
            related_words.append(res["word"])
    return [str(w).title() for w in related_words]

def get_final_option_list(ans, other_options):
    # Build a 4-way option tuple: the answer plus up to three distractors,
    # padded with "dummy" when fewer than three were found, then shuffled.
    final_options = [ans] + list(other_options[:3])
    while len(final_options) < 4:
        final_options.append("dummy")
    random.shuffle(final_options)
    final_options = tuple(final_options)
    ans_index = final_options.index(ans)
    return final_options, ans_index
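
# Example (sketch): get_final_option_list("Elephant", ["Mammoth", "Rhino"])
# might return (("Rhino", "Elephant", "dummy", "Mammoth"), 1); the second
# value is the shuffled position of the correct answer.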

def load_raw_text():
    return "Billy and Ron are brothers. Billy is 5 years old. Ron is 7 years old. One day their mom took them to the zoo. Billy wore his red cap, and Ron wore his blue cap. They had fun watching all the animals. Ron liked the monkeys the best. He wanted to stay and watch them some more, but Billy wanted to go see the elephants. Elephants were Billy’s favorite. Their mom said it was time to go see the elephants, and Ron was sad. But their mom said they could come back and see the monkeys again before they left the zoo. Billy and Ron had a great day at the zoo."