import spacy
import torch
import numpy as np
from transformers import T5ForConditionalGeneration, AutoTokenizer, BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

"""
spacy.load() returns a language model object containing all components and data needed to process text. It is usually called nlp. Calling the nlp object on a string of text will return a processed Doc
"""
nlp = spacy.load("en_core_web_sm") #spacy's trained pipeline model
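
# Illustrative example (output from en_core_web_sm; exact tags may vary by version):
#   doc = nlp("Paris is a beautiful city.")
#   [(t.text, t.pos_) for t in doc]
#   -> [('Paris', 'PROPN'), ('is', 'AUX'), ('a', 'DET'),
#       ('beautiful', 'ADJ'), ('city', 'NOUN'), ('.', 'PUNCT')]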

from warnings import filterwarnings as filt
filt('ignore')

class QuestionGenerator:
    def __init__(self, path, device, model_max_length):
        self.device = torch.device(device)
        # Move the model to the target device so it matches the inputs in gen_question().
        self.model = T5ForConditionalGeneration.from_pretrained(path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=model_max_length)

    def preprocess(self, data):
        # Replace newlines with spaces so words on adjacent lines are not glued together.
        return data.strip().replace('\n', ' ')

    def gen_question(self, data, answer):
        data = self.preprocess(data)
        t5_prepared_data = f'context: {data} answer: {answer}'
        encoding = self.tokenizer.encode_plus(t5_prepared_data,
                                              max_length=512,
                                              padding='max_length',
                                              truncation=True,
                                              return_tensors='pt').to(self.device)
        input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']
        output = self.model.generate(input_ids,
                                     attention_mask=attention_mask,
                                     num_beams=4,
                                     num_return_sequences=1,
                                     no_repeat_ngram_size=2,
                                     min_length=30,
                                     max_length=512,
                                     early_stopping=True)

        dec = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output]
        question = dec[0].replace('question:', '').strip()
        return question
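
# Usage sketch (illustrative; the exact wording of the generated question
# depends on the checkpoint and generation settings):
#   qg = QuestionGenerator('ramsrigouthamg/t5_squad_v1', 'cpu', 512)
#   qg.gen_question('Paris is the capital and largest city of France.', 'Paris')
#   -> e.g. 'What is the capital of France?'
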
class KeywordGenerator:
    def __init__(self, path, device):
        self.device = torch.device(device)
        self.bert_model = BertModel.from_pretrained(path)
        self.bert_tokenizer = BertTokenizer.from_pretrained(path)
        self.sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

    def get_embedding(self, txt):
        """
        Token embedding:
        txt = '[CLS] ' + doc + ' [SEP]', where [CLS] (used for classification tasks) marks the start of the sequence, [SEP] separates and ends sentences, and doc is the document to be encoded.
        Ex: Sentence A: Paris is a beautiful city.
            Sentence B: I love Paris.
            tokens = ['[CLS]', 'Paris', 'is', 'a', 'beautiful', 'city', '[SEP]', 'I', 'love', 'Paris', '[SEP]']
        Before feeding the tokens to BERT, they are converted into embeddings by an embedding layer called the token embedding layer.
        """
        tokens = self.bert_tokenizer.tokenize(txt)
        token_idx = self.bert_tokenizer.convert_tokens_to_ids(tokens)
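
        # Illustrative (assuming a 'bert-base-uncased' checkpoint):
        #   self.bert_tokenizer.tokenize('paris is a beautiful city')
        #   -> ['paris', 'is', 'a', 'beautiful', 'city']
        # Note: tokenize() does not add [CLS]/[SEP] itself; they would have to be
        # included in txt (or added via encode_plus(..., add_special_tokens=True)).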

        """
        Segment Embedding
        Segment embedding is used to distinguish between the two gives sentences.The segment embedding layer returns only either of the two embedding EA(embedding of Sentence A) or EB(embedding of Sentence B) i.e if the input token belongs to sentence A then EA else EB for sentence B.
        """
        segment_ids=[1]*len(token_idx) #This is the segment_ids for the document. [1]*len(token_idxs) is a list of 1s of length len(token_idxs).

        torch_token = torch.tensor([token_idx])
        torch_segment = torch.tensor([segment_ids])
        return self.bert_model(torch_token,torch_segment)[-1].detach().numpy() # 

    def get_posTags(self, context):
        """Return the POS tags of the tokens in the context, using spaCy's POS tagger."""
        doc = nlp(context)
        doc_pos = [token.pos_ for token in doc]
        # Note: spaCy tokens and context.split() may not align one-to-one,
        # since spaCy splits punctuation into separate tokens.
        return doc_pos, context.split()
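
    # Illustrative (tags from en_core_web_sm):
    #   get_posTags('I love Paris') -> (['PRON', 'VERB', 'PROPN'], ['I', 'love', 'Paris'])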

    def get_sentence(self, context):
        """Return the sentences in the context, using spaCy's sentence segmenter."""
        doc = nlp(context)
        return list(doc.sents)

    def get_vector(self, doc):
        """
        Machines cannot operate on raw characters and words, so text has to be represented numerically. CountVectorizer converts text into a matrix of token counts; here it is used only to extract the vocabulary of candidate keywords.
        """
        stop_words = 'english'  # use scikit-learn's built-in English stop-word list
        n_gram_range = (1, 1)  # (1, 1) -> unigrams only; (1, 2) would add bigrams, (2, 2) bigrams only, etc.
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=n_gram_range).fit([doc])
        # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
        return vectorizer.get_feature_names_out()  # the vocabulary learned from the text
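
    # Illustrative behavior of get_vector (assuming scikit-learn >= 1.0):
    #   CountVectorizer(stop_words='english').fit(['Paris is a beautiful city']).get_feature_names_out()
    #   -> array(['beautiful', 'city', 'paris'], dtype=object)   # 'is' and 'a' are stop words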

    def get_key_words(self, context, module_type='t'):
        """
        module_type: 't' ranks candidates with the raw BERT embeddings from
        get_embedding(); any other value uses the SentenceTransformer model.
        """
        keywords = []
        top_n = 5
        for txt in self.get_sentence(context):
            keyword = list(self.get_vector(str(txt)))
            if module_type == 't':
                doc_embedding = self.get_embedding(str(txt))
                # Embed each candidate word separately so that all of them can be
                # ranked (joining them into one string would yield a single embedding).
                keyword_embedding = np.vstack([self.get_embedding(kw) for kw in keyword])
            else:
                doc_embedding = self.sentence_model.encode([str(txt)])
                keyword_embedding = self.sentence_model.encode(keyword)

            # cosine_similarity returns a (1, n_candidates) matrix; keep the indices
            # of the top_n candidates most similar to the sentence embedding.
            distances = cosine_similarity(doc_embedding, keyword_embedding)
            keywords += [(keyword[index], str(txt)) for index in distances.argsort()[0][-top_n:]]

        return keywords

txt = """Enter text"""
for ans, context in KeywordGenerator('bert-base-uncased','cpu').get_key_words(txt,'st'):
  print(QuestionGenerator('ramsrigouthamg/t5_squad_v1','cpu',512).gen_question(context, ans))
  print()