|
|
import nltk |
|
|
|
|
from nltk.corpus import wordnet as wn |
|
|
from nltk.tokenize import sent_tokenize |
|
|
from nltk.corpus import stopwords |
|
|
from time import sleep |
|
|
|
|
|
from flashtext import KeywordProcessor |
|
|
from pprint import pprint |
|
|
import random |
|
|
import pke |
|
|
import traceback |
|
|
|
|
|
import json |
|
|
import requests |
|
|
import string |
|
|
import re |
|
|
|
|
import itertools |
|
|
|
|
|
import streamlit as st |
|
|
from transformers import T5ForConditionalGeneration,T5Tokenizer |
|
|
|
|
|
from transformers import pipeline |
|
|
|
|
|
import torch |
|
|
|
|
import numpy as np

# sent_tokenize() and stopwords.words('english') need these NLTK corpora downloaded once
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
|
|
|
|
|
# Seed every random number generator in use so results are reproducible across runs
def set_seed(seed: int):
|
|
random.seed(seed) |
|
|
np.random.seed(seed) |
|
|
torch.manual_seed(seed) |
|
|
torch.cuda.manual_seed_all(seed) |
|
|
|
|
|
set_seed(42) |
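# Load the summarization model (plain t5-base, used with the "summarize:" prefix)
# and the question-generation model fine-tuned on SQuAD.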
|
|
|
|
|
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base') |
|
|
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base') |
|
|
|
|
|
question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')

# summarizer() and get_question() both move their inputs to `device`, so define it
# here and put the models on the same device (GPU when available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = summary_model.to(device)
question_model = question_model.to(device)
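# Note: Streamlit re-runs this whole script on every widget interaction, so the models
# above are reloaded each time. If your Streamlit version provides st.cache_resource,
# wrapping the loading code in a cached function avoids the repeated loads, e.g.:
#
#   @st.cache_resource
#   def load_models():
#       ...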
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.header("Question Creation")
st.subheader("Enter the text and click on Generate Question. Questions will be created automatically.")
text = st.text_area("Input the text to get questions", placeholder="Enter the text", height=200)
button = st.button("Generate Question")
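# ---- Helper functions: summary post-processing, keyphrase extraction, question generation ----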
|
|
|
|
|
# Recase the generated summary: capitalize the first letter of every sentence
def postprocesstext(content):
|
|
final="" |
|
|
for sent in sent_tokenize(content): |
|
|
sent = sent.capitalize() |
|
|
final = final +" "+sent |
|
|
return final |
|
|
|
|
|
|
|
|
# Summarize the input with T5; the "summarize: " prefix selects the summarization task
def summarizer(text, model, tokenizer):
|
|
text = text.strip().replace("\n"," ") |
|
|
text = "summarize: "+text |
|
|
print (text) |
|
|
max_len = 512 |
|
|
encoding = tokenizer.encode_plus(text,max_length=max_len, pad_to_max_length=False,truncation=True, return_tensors="pt").to(device) |
|
|
|
|
|
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"] |
|
|
|
|
|
outs = model.generate(input_ids=input_ids, |
|
|
attention_mask=attention_mask, |
|
|
early_stopping=True, |
|
|
num_beams=3, |
|
|
num_return_sequences=1, |
|
|
no_repeat_ngram_size=2, |
|
|
min_length = 75, |
|
|
max_length=300) |
|
|
|
|
|
|
|
|
dec = [tokenizer.decode(ids,skip_special_tokens=True) for ids in outs] |
|
|
summary = dec[0] |
|
|
summary = postprocesstext(summary) |
|
|
summary= summary.strip() |
|
|
print( "done from summarizer") |
|
|
return summary |
|
|
|
|
|
|
|
|
def get_nouns_multipartite(content):
    # Extract candidate keyphrases (nouns and proper nouns) with MultipartiteRank
    out = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()

        # Ignore punctuation, PTB-style bracket tokens and English stopwords
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')

        # pke 2.x takes the stoplist here; older pke versions expect it in
        # candidate_selection(pos=pos, stoplist=stoplist) instead
        extractor.load_document(input=content, language='en', stoplist=stoplist)

        pos = {'PROPN', 'NOUN'}
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)

        for val in keyphrases:
            out.append(val[0])
    except Exception:
        out = []
        traceback.print_exc()

    return out
|
|
|
|
|
# Keep only the keyphrases from the original text that also appear in the summary
def get_keywords(originaltext, summarytext):
|
|
keywords = get_nouns_multipartite(originaltext) |
|
|
print ("keywords unsummarized: ",keywords) |
|
|
keyword_processor = KeywordProcessor() |
|
|
for keyword in keywords: |
|
|
keyword_processor.add_keyword(keyword) |
|
|
|
|
|
keywords_found = keyword_processor.extract_keywords(summarytext) |
|
|
keywords_found = list(set(keywords_found)) |
|
|
print ("keywords_found in summarized: ",keywords_found) |
|
|
|
|
|
important_keywords =[] |
|
|
for keyword in keywords: |
|
|
if keyword in keywords_found: |
|
|
important_keywords.append(keyword) |
|
|
|
|
|
return important_keywords[:4] |
|
|
|
|
|
# Generate a question whose answer is `answer`, conditioned on `context`,
# using the SQuAD-fine-tuned T5 question-generation model
def get_question(context, answer, model, tokenizer):
|
|
text = "context: {} answer: {}".format(context,answer) |
|
|
encoding = tokenizer.encode_plus(text,max_length=384, pad_to_max_length=False,truncation=True, return_tensors="pt").to(device) |
|
|
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"] |
|
|
|
|
|
outs = model.generate(input_ids=input_ids, |
|
|
attention_mask=attention_mask, |
|
|
early_stopping=True, |
|
|
num_beams=5, |
|
|
num_return_sequences=1, |
|
|
no_repeat_ngram_size=2, |
|
|
max_length=72) |
|
|
|
|
|
|
|
|
dec = [tokenizer.decode(ids,skip_special_tokens=True) for ids in outs] |
|
|
|
|
|
|
|
|
Question = dec[0].replace("question:","") |
|
|
Question= Question.strip() |
|
|
return Question |
|
|
|
|
|
|
|
|
if text and button:

    # Summarize the input, pick answer keywords, then generate one question per answer
    summarized_text = summarizer(text, summary_model, summary_tokenizer)
    print("summary:", summarized_text)

    imp_keywords = get_keywords(text, summarized_text)
    for answer in imp_keywords:
        ques = get_question(summarized_text, answer, question_model, question_tokenizer)
        st.write(ques)
        st.write(answer.capitalize())
        st.write("\n")
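
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py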