import numpy as np
import spacy
import torch
from transformers import T5ForConditionalGeneration, AutoTokenizer, BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

""" |
|
spacy.load() returns a language model object containing all components and data needed to process text. It is usually called nlp. Calling the nlp object on a string of text will return a processed Doc |
|
""" |
|
nlp = spacy.load("en_core_web_sm") |
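
# A minimal sketch of how the loaded pipeline is used (illustrative only; the example
# sentences are made up and not part of the original script):
# doc = nlp("Paris is a beautiful city. I love Paris.")
# [token.text for token in doc]        # tokens
# [token.pos_ for token in doc]        # POS tags (used by KeywordGenerator.get_posTags)
# [sent.text for sent in doc.sents]    # sentence segmentation (used by get_sentence)
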
from warnings import filterwarnings as filt
filt('ignore')


class QuestionGenerator:
    def __init__(self, path, device, model_max_length):
        self.device = torch.device(device)
        # Move the model onto the requested device so generation works on GPU as well as CPU.
        self.model = T5ForConditionalGeneration.from_pretrained(path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=model_max_length)

    def preprocess(self, data):
        # Replace newlines with spaces so that words on adjacent lines are not merged together.
        preprocess_text = data.strip().replace('\n', ' ')
        return preprocess_text

    def gen_question(self, data, answer):
        data = self.preprocess(data)
        # Prompt format used by this T5 question-generation checkpoint: "context: ... answer: ...".
        t5_prepared_data = f'context: {data} answer: {answer}'
        encoding = self.tokenizer.encode_plus(t5_prepared_data,
                                              max_length=512,
                                              padding='max_length',
                                              truncation=True,
                                              return_tensors='pt').to(self.device)
        input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']
        output = self.model.generate(input_ids,
                                     attention_mask=attention_mask,
                                     num_beams=4,
                                     num_return_sequences=1,
                                     no_repeat_ngram_size=2,
                                     min_length=30,
                                     max_length=512,
                                     early_stopping=True)

        dec = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output]
        question = dec[0].replace('question:', '').strip()
        return question
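
# Rough usage sketch (illustrative only; the checkpoint name is taken from the driver code
# at the bottom of this file, and the sample question is just an example of the output):
# qg = QuestionGenerator('ramsrigouthamg/t5_squad_v1', 'cpu', 512)
# qg.gen_question('Paris is the capital and most populous city of France.', 'Paris')
# -> e.g. "What is the capital of France?"
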
class KeywordGenerator:
    def __init__(self, path, device):
        self.device = torch.device(device)
        self.bert_model = BertModel.from_pretrained(path)
        self.bert_tokenizer = BertTokenizer.from_pretrained(path)
        # SentenceTransformer model used when module_type != 't' in get_key_words.
        self.sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

    def get_embedding(self, txt):
        """
        Token Embedding
        txt = '[CLS] ' + doc + ' [SEP]', where [CLS] (used for classification tasks) marks the
        start of the sequence, [SEP] marks the end of a sentence, and doc is the document to be
        encoded.
        Ex: Sentence A: Paris is a beautiful city.
            Sentence B: I love Paris.
        tokens = [[CLS], Paris, is, a, beautiful, city, [SEP], I, love, Paris, [SEP]]
        Before feeding the tokens to BERT, we convert them into embeddings using an embedding
        layer called the token embedding layer.
        """
        tokens = ['[CLS]'] + self.bert_tokenizer.tokenize(txt) + ['[SEP]']
        token_idx = self.bert_tokenizer.convert_tokens_to_ids(tokens)

        """
        Segment Embedding
        Segment embedding is used to distinguish between two given sentences. The segment
        embedding layer returns only one of two embeddings, EA (embedding of Sentence A) or
        EB (embedding of Sentence B): if the input token belongs to Sentence A it gets EA,
        otherwise EB.
        """
        # Only a single segment is passed here, so every token gets the same segment id.
        segment_ids = [1] * len(token_idx)

        torch_token = torch.tensor([token_idx])
        torch_segment = torch.tensor([segment_ids])
        # Pass the segment ids as token_type_ids and return the pooled [CLS] output as the embedding.
        return self.bert_model(torch_token, token_type_ids=torch_segment)[-1].detach().numpy()
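
    # Rough sketch of what get_embedding returns (illustrative only): for a bert-base
    # checkpoint the pooled [CLS] vector has 768 dimensions, e.g.
    #   kg = KeywordGenerator('bert-base-uncased', 'cpu')
    #   kg.get_embedding('Paris is a beautiful city.').shape   # (1, 768)
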
    def get_posTags(self, context):
        """Returns the POS tags of the words in the context. Uses spaCy's POS tagger."""
        doc = nlp(context)
        doc_pos = [token.pos_ for token in doc]
        # Return the spaCy token texts alongside the tags so the two lists stay aligned.
        return doc_pos, [token.text for token in doc]

    def get_sentence(self, context):
        """Returns the sentences in the context. Uses spaCy's sentence tokenizer."""
        doc = nlp(context)
        return list(doc.sents)

    def get_vector(self, doc):
        """
        Machines cannot understand characters and words, so text data has to be represented as
        numbers before a model can use it. CountVectorizer is one way to convert text to
        numerical data; its vocabulary is used here as the set of candidate keywords.
        """
        stop_words = "english"
        n_gram_range = (1, 1)
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=n_gram_range).fit([doc])
        # scikit-learn >= 1.2 exposes the fitted vocabulary via get_feature_names_out().
        return list(vectorizer.get_feature_names_out())
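
    # Rough sketch of what get_vector returns (illustrative example, not from the original
    # script): the candidate keywords are the vectorizer's lowercased vocabulary with English
    # stop words removed, e.g.
    #   get_vector('Paris is a beautiful city')   # -> ['beautiful', 'city', 'paris']
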
    def get_key_words(self, context, module_type='t'):
        """
        module_type: 't' uses the BERT token embeddings from get_embedding;
        any other value uses the SentenceTransformer model.
        """
        keywords = []
        top_n = 5
        for txt in self.get_sentence(context):
            keyword = self.get_vector(str(txt))
            print(f'vectors: {keyword}')
            if module_type == 't':
                doc_embedding = self.get_embedding(str(txt))
                # Embed each candidate keyword separately so every candidate can be ranked.
                keyword_embedding = np.vstack([self.get_embedding(kw) for kw in keyword])
            else:
                doc_embedding = self.sentence_model.encode([str(txt)])
                keyword_embedding = self.sentence_model.encode(keyword)

            # Cosine similarity between the sentence embedding and each candidate keyword;
            # the top_n most similar candidates are kept.
            similarities = cosine_similarity(doc_embedding, keyword_embedding)
            print(similarities)
            keywords += [(keyword[index], str(txt)) for index in similarities.argsort()[0][-top_n:]]

        return keywords

txt = """Enter text"""

# Build the generators once, outside the loop, so the models are only loaded a single time.
keyword_generator = KeywordGenerator('bert-base-uncased', 'cpu')
question_generator = QuestionGenerator('ramsrigouthamg/t5_squad_v1', 'cpu', 512)

for ans, context in keyword_generator.get_key_words(txt, 'st'):
    print(question_generator.gen_question(context, ans))
    print()