import numpy as np
import spacy
import torch
from transformers import T5ForConditionalGeneration, AutoTokenizer, BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

""" |
|
spacy.load() returns a language model object containing all components and data needed to process text. It is usually called nlp. Calling the nlp object on a string of text will return a processed Doc |
|
""" |
|
nlp = spacy.load("en_core_web_sm") |
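
# A minimal sketch of how the loaded pipeline is used (illustrative only; the example
# sentences are made up and not part of the original script):
# doc = nlp("Paris is a beautiful city. I love Paris.")
# [token.text for token in doc]        # tokens
# [token.pos_ for token in doc]        # POS tags (used by KeywordGenerator.get_posTags)
# [sent.text for sent in doc.sents]    # sentence segmentation (used by get_sentence)
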
from warnings import filterwarnings as filt
filt('ignore')


class QuestionGenerator:
    def __init__(self, path, device, model_max_length):
        self.device = torch.device(device)
        # Move the model onto the requested device so generation works on GPU as well as CPU.
        self.model = T5ForConditionalGeneration.from_pretrained(path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=model_max_length)

    def preprocess(self, data):
        # Replace newlines with spaces so that words on adjacent lines are not merged together.
        preprocess_text = data.strip().replace('\n', ' ')
        return preprocess_text

    def gen_question(self, data, answer):
        data = self.preprocess(data)
        # Prompt format used by this T5 question-generation checkpoint: "context: ... answer: ...".
        t5_prepared_data = f'context: {data} answer: {answer}'
        encoding = self.tokenizer.encode_plus(t5_prepared_data,
                                              max_length=512,
                                              padding='max_length',
                                              truncation=True,
                                              return_tensors='pt').to(self.device)
        input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']
        output = self.model.generate(input_ids,
                                     attention_mask=attention_mask,
                                     num_beams=4,
                                     num_return_sequences=1,
                                     no_repeat_ngram_size=2,
                                     min_length=30,
                                     max_length=512,
                                     early_stopping=True)

        dec = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output]
        question = dec[0].replace('question:', '').strip()
        return question
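
# Rough usage sketch (illustrative only; the checkpoint name is taken from the driver code
# at the bottom of this file, and the sample question is just an example of the output):
# qg = QuestionGenerator('ramsrigouthamg/t5_squad_v1', 'cpu', 512)
# qg.gen_question('Paris is the capital and most populous city of France.', 'Paris')
# -> e.g. "What is the capital of France?"
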
class KeywordGenerator:
    def __init__(self, path, device):
        self.device = torch.device(device)
        self.bert_model = BertModel.from_pretrained(path)
        self.bert_tokenizer = BertTokenizer.from_pretrained(path)
        # SentenceTransformer model used when module_type != 't' in get_key_words.
        self.sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

    def get_embedding(self, txt):
        """
        Token Embedding
        txt = '[CLS] ' + doc + ' [SEP]', where [CLS] (used for classification tasks) marks the
        start of the sequence, [SEP] marks the end of a sentence, and doc is the document to be
        encoded.
        Ex: Sentence A: Paris is a beautiful city.
            Sentence B: I love Paris.
        tokens = [[CLS], Paris, is, a, beautiful, city, [SEP], I, love, Paris, [SEP]]
        Before feeding the tokens to BERT, we convert them into embeddings using an embedding
        layer called the token embedding layer.
        """
        tokens = ['[CLS]'] + self.bert_tokenizer.tokenize(txt) + ['[SEP]']
        token_idx = self.bert_tokenizer.convert_tokens_to_ids(tokens)

        """
        Segment Embedding
        Segment embedding is used to distinguish between two given sentences. The segment
        embedding layer returns only one of two embeddings, EA (embedding of Sentence A) or
        EB (embedding of Sentence B): if the input token belongs to Sentence A it gets EA,
        otherwise EB.
        """
        # Only a single segment is passed here, so every token gets the same segment id.
        segment_ids = [1] * len(token_idx)

        torch_token = torch.tensor([token_idx])
        torch_segment = torch.tensor([segment_ids])
        # Pass the segment ids as token_type_ids and return the pooled [CLS] output as the embedding.
        return self.bert_model(torch_token, token_type_ids=torch_segment)[-1].detach().numpy()
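
    # Rough sketch of what get_embedding returns (illustrative only): for a bert-base
    # checkpoint the pooled [CLS] vector has 768 dimensions, e.g.
    #   kg = KeywordGenerator('bert-base-uncased', 'cpu')
    #   kg.get_embedding('Paris is a beautiful city.').shape   # (1, 768)
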
    def get_posTags(self, context):
        """Returns the POS tags of the words in the context. Uses spaCy's POS tagger."""
        doc = nlp(context)
        doc_pos = [token.pos_ for token in doc]
        # Return the spaCy token texts alongside the tags so the two lists stay aligned.
        return doc_pos, [token.text for token in doc]

    def get_sentence(self, context):
        """Returns the sentences in the context. Uses spaCy's sentence tokenizer."""
        doc = nlp(context)
        return list(doc.sents)

    def get_vector(self, doc):
        """
        Machines cannot understand characters and words, so text data has to be represented as
        numbers before a model can use it. CountVectorizer is one way to convert text to
        numerical data; its vocabulary is used here as the set of candidate keywords.
        """
        stop_words = "english"
        n_gram_range = (1, 1)
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=n_gram_range).fit([doc])
        # scikit-learn >= 1.2 exposes the fitted vocabulary via get_feature_names_out().
        return list(vectorizer.get_feature_names_out())
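
    # Rough sketch of what get_vector returns (illustrative example, not from the original
    # script): the candidate keywords are the vectorizer's lowercased vocabulary with English
    # stop words removed, e.g.
    #   get_vector('Paris is a beautiful city')   # -> ['beautiful', 'city', 'paris']
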
    def get_key_words(self, context, module_type='t'):
        """
        module_type: 't' uses the BERT token embeddings from get_embedding;
        any other value uses the SentenceTransformer model.
        """
        keywords = []
        top_n = 5
        for txt in self.get_sentence(context):
            keyword = self.get_vector(str(txt))
            print(f'vectors: {keyword}')
            if module_type == 't':
                doc_embedding = self.get_embedding(str(txt))
                # Embed each candidate keyword separately so every candidate can be ranked.
                keyword_embedding = np.vstack([self.get_embedding(kw) for kw in keyword])
            else:
                doc_embedding = self.sentence_model.encode([str(txt)])
                keyword_embedding = self.sentence_model.encode(keyword)

            # Cosine similarity between the sentence embedding and each candidate keyword;
            # the top_n most similar candidates are kept.
            similarities = cosine_similarity(doc_embedding, keyword_embedding)
            print(similarities)
            keywords += [(keyword[index], str(txt)) for index in similarities.argsort()[0][-top_n:]]

        return keywords

txt = """Enter text"""

# Build the generators once, outside the loop, so the models are only loaded a single time.
keyword_generator = KeywordGenerator('bert-base-uncased', 'cpu')
question_generator = QuestionGenerator('ramsrigouthamg/t5_squad_v1', 'cpu', 512)

for ans, context in keyword_generator.get_key_words(txt, 'st'):
    print(question_generator.gen_question(context, ans))
    print()