# File size: 6,125 Bytes | 32d9382
from TextSummarization import T5_Base
import spacy
import torch
from transformers import BertTokenizer, BertModel
from transformers import T5ForConditionalGeneration, T5Tokenizer, BertTokenizer, BertModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
"""
spacy.load() returns a language model object containing all components and data needed to process text. It is usually called nlp. Calling the nlp object on a string of text will return a processed Doc
"""
# Load the spaCy English pipeline once at import time. The module-level `nlp`
# object is used by KeywordGenerator.get_posTags and KeywordGenerator.get_sentence.
nlp = spacy.load("en_core_web_sm") #spacy's trained pipeline model
# Install a blanket 'ignore' warning filter for everything that runs after this point.
from warnings import filterwarnings as filt
filt('ignore')
class QuestionGenerator:
    """Generate a question from a context passage and a target answer.

    Wraps a T5 conditional-generation checkpoint together with its tokenizer.
    The prompt fed to the model has the form
    ``context: <passage> answer: <answer>``.
    """

    def __init__(self, path, device, model_max_length):
        """Load the model and tokenizer.

        Args:
            path: Checkpoint name or directory passed to ``from_pretrained``.
            device: Device string understood by ``torch.device`` (e.g. 'cpu').
            model_max_length: Forwarded to the tokenizer as ``model_max_length``.
        """
        self.device = torch.device(device)
        self.tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=model_max_length)
        # gen_question moves the encoded inputs to self.device, so the model
        # has to live on the same device or generate() fails on non-CPU devices.
        self.model = T5ForConditionalGeneration.from_pretrained(path).to(self.device)

    def preprocess(self, data):
        """Return ``data`` stripped, with line breaks turned into spaces.

        Line breaks are replaced by a space (not removed) so that the words on
        either side of a break are not fused into one token.
        """
        return data.strip().replace('\n', ' ')

    def gen_question(self, data, answer):
        """Generate one question whose answer is ``answer`` given ``data``.

        Args:
            data: Context passage (str).
            answer: Answer span the question should target.

        Returns:
            The decoded question with any ``question:`` prefix removed.
        """
        context = self.preprocess(data)
        prompt = f'context: {context} answer: {answer}'
        # `padding='max_length'` is the supported spelling of the deprecated
        # `pad_to_max_length=True`.
        encoding = self.tokenizer(
            prompt,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        ).to(self.device)
        output = self.model.generate(
            encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
            num_beams=4,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            min_length=30,
            max_length=512,
            early_stopping=True,
        )
        # Only one sequence is requested, so decode just the first result.
        decoded = self.tokenizer.decode(output[0], skip_special_tokens=True)
        return decoded.replace("question:", "").strip()
class KeywordGenerator:
    """Pick keywords from each sentence of a text by embedding similarity.

    Candidate keywords are the non-stop-word unigrams of a sentence. Each
    candidate is scored by cosine similarity between its embedding and the
    embedding of the whole sentence; the best-scoring ones are returned.
    """

    def __init__(self, path, device):
        """Load the BERT model/tokenizer and the sentence-transformer model.

        Args:
            path: BERT checkpoint name or directory passed to ``from_pretrained``.
            device: Device string understood by ``torch.device`` (e.g. 'cpu').
        """
        self.device = torch.device(device)
        self.bert_tokenizer = BertTokenizer.from_pretrained(path)
        # Keep the model on the device that get_embedding sends its inputs to.
        self.bert_model = BertModel.from_pretrained(path).to(self.device)
        self.sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

    def get_embedding(self, doc):
        """Embed one text or a list of texts with the BERT model.

        The tokenizer call builds the model inputs for each text: the token
        ids wrapped as ``[CLS] ... [SEP]``, the segment (token type) ids and
        the attention mask. Texts in a list are padded to a common length, and
        inputs longer than the tokenizer's configured maximum are truncated.

        Args:
            doc: A string, or a list of strings to embed as one batch.

        Returns:
            A 2-D numpy array with one row per input text, taken from the last
            element of the model output (for ``BertModel`` in its default
            configuration this is the pooled ``[CLS]`` representation).
        """
        encoded = self.bert_tokenizer(
            doc,
            padding=True,
            truncation=True,
            return_tensors='pt',
        ).to(self.device)
        # Inference only: no gradients are needed for embeddings.
        with torch.no_grad():
            output = self.bert_model(**encoded)
        return output[-1].detach().cpu().numpy()

    def get_posTags(self, context):
        """Return ``(pos_tags, words)`` for ``context``.

        ``pos_tags`` holds spaCy's ``pos_`` for every spaCy token, while
        ``words`` is ``context.split()``; the two lists are not guaranteed to
        have the same length because spaCy tokenizes differently from
        whitespace splitting.
        """
        doc = nlp(context)
        doc_pos = [token.pos_ for token in doc]
        return doc_pos, context.split()

    def get_sentence(self, context):
        """Return the sentences of ``context`` as a list of spaCy spans."""
        doc = nlp(context)
        return list(doc.sents)

    def get_vector(self, doc):
        """Return the candidate keywords (unigrams without stop words) of ``doc``.

        CountVectorizer is used only for its tokenization and English
        stop-word filtering; the counts themselves are not used.

        Returns:
            A list of vocabulary terms, or an empty list when the text yields
            no vocabulary (for example when it consists only of stop words).
        """
        stop_words = "english"
        # (1, 1) -> unigrams only; (1, 2) would add bigrams, and so on.
        n_gram_range = (1, 1)
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=n_gram_range)
        try:
            vectorizer.fit([doc])
        except ValueError:
            # Raised by scikit-learn when the vocabulary would be empty.
            return []
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it
        # only on versions that predate get_feature_names_out().
        get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
        return list(get_names())

    def get_key_words(self, context, module_type='t'):
        """Return the top keywords of every sentence in ``context``.

        Args:
            context: Text to process.
            module_type: ``'t'`` embeds with the BERT model (get_embedding);
                any other value embeds with the sentence-transformer model.

        Returns:
            A list of ``(keyword, sentence_text)`` tuples, at most five per
            sentence, in ascending order of similarity within each sentence.
        """
        # Nothing to do for missing/blank input; avoids running the pipeline.
        if not context or not context.strip():
            return []

        top_n = 5
        keywords = []
        for sentence in self.get_sentence(context):
            sentence_text = str(sentence)
            candidates = self.get_vector(sentence_text)
            print(f'vectors: {candidates}')
            if not candidates:
                # Sentence had no usable words; there is nothing to rank.
                continue
            if module_type == 't':
                doc_embedding = self.get_embedding(sentence_text)
                # Embed every candidate separately (one row each) so that each
                # keyword gets its own similarity score.
                keyword_embedding = self.get_embedding(candidates)
            else:
                doc_embedding = self.sentence_model.encode([sentence_text])
                keyword_embedding = self.sentence_model.encode(candidates)
            distances = cosine_similarity(doc_embedding, keyword_embedding)
            print(distances)
            # argsort is ascending, so the last top_n indices are the best matches.
            keywords += [(candidates[index], sentence_text) for index in distances.argsort()[0][-top_n:]]
        return keywords
txt = """Enter text"""
# Build each generator once: constructing them loads model weights, so doing
# it inside the loop would reload the T5 checkpoint for every keyword.
keyword_generator = KeywordGenerator('bert-base-uncased','cpu')
question_generator = QuestionGenerator('ramsrigouthamg/t5_squad_v1','cpu',512)
# 'st' is not 't', so get_key_words takes its sentence-transformer branch.
for ans, context in keyword_generator.get_key_words(txt,'st'):
    print(question_generator.gen_question(context, ans))
    print()