|
|
|
"""QuestionGenerator.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1k0AavzSaNYxe36bk65fsXWzC4xSSwo6X |
|
""" |
|
|
|
from textwrap3 import wrap |
|
|
|
# Sample passage (a fable about a Lion and a Mouse) used as the demo input for
# the summarisation and question-generation steps further down.
text = """A Lion lay asleep in the forest, his great head resting on his paws. A timid little Mouse came upon him unexpectedly, and in her fright and haste to
get away, ran across the Lion's nose. Roused from his nap, the Lion laid his huge paw angrily on the tiny creature to kill her. "Spare me!" begged
the poor Mouse. "Please let me go and some day I will surely repay you." The Lion was much amused to think that a Mouse could ever help him. But he
was generous and finally let the Mouse go. Some days later, while stalking his prey in the forest, the Lion was caught in the toils of a hunter's
net. Unable to free himself, he filled the forest with his angry roaring. The Mouse knew the voice and quickly found the Lion struggling in the net.
Running to one of the great ropes that bound him, she gnawed it until it parted, and soon the Lion was free. "You laughed when I said I would repay
you," said the Mouse. "Now you see that even a Mouse can help a Lion." """

# Echo the passage wrapped to 150 columns, followed by a blank separator.
for wrp in wrap(text, 150):
    print (wrp)
print ("\n")
|
|
|
import torch |
|
from transformers import T5ForConditionalGeneration,T5Tokenizer |
|
# Load the pretrained 't5-base' checkpoint and its tokenizer; this pair is
# what the rest of the file passes to summarizer().
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = summary_model.to(device)
|
|
|
import random |
|
import numpy as np |
|
|
|
def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: Value handed unchanged to each generator's seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed every generator once, as soon as the helper exists.
set_seed(42)
|
|
|
import nltk |
|
nltk.download('punkt') |
|
nltk.download('brown') |
|
nltk.download('wordnet') |
|
from nltk.corpus import wordnet as wn |
|
from nltk.tokenize import sent_tokenize |
|
|
|
def postprocesstext (content):
    """Capitalise each sentence of *content* and re-join them.

    The text is split with ``sent_tokenize``; every sentence is passed through
    ``str.capitalize`` and prefixed with a single space, so a non-empty result
    starts with a space.

    Args:
        content: Text to normalise.

    Returns:
        The concatenated, capitalised sentences ("" when there are none).
    """
    return "".join(" " + sentence.capitalize() for sentence in sent_tokenize(content))
|
|
|
|
|
def summarizer(text,model,tokenizer):
    """Summarise *text* with a T5-style sequence-to-sequence model.

    The text is stripped, its newlines are replaced by spaces and it is
    prefixed with ``"summarize: "`` before being tokenised (truncated to 512
    tokens). Generation uses beam search with 3 beams and a length window of
    75-300 tokens. The decoded summary is passed through ``postprocesstext``
    and stripped.

    Args:
        text: Source text to summarise.
        model: Model exposing ``generate`` and ``device`` (e.g. a
            ``T5ForConditionalGeneration`` instance).
        tokenizer: Tokenizer matching *model*.

    Returns:
        The summary as a string.
    """
    source = "summarize: " + text.strip().replace("\n", " ")

    max_len = 512
    # `padding=False` replaces the deprecated `pad_to_max_length=False`.
    # Tensors are moved to the device the model actually lives on, so the
    # function does not depend on a module-level `device` variable.
    encoding = tokenizer(
        source,
        max_length=max_len,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    outs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=3,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        min_length=75,
        max_length=300,
    )

    # Exactly one sequence is requested, so only the first one is decoded.
    summary = tokenizer.decode(outs[0], skip_special_tokens=True)
    return postprocesstext(summary).strip()
|
|
|
|
|
# Summarise the sample passage once with the model/tokenizer loaded above.
summarized_text = summarizer(text,summary_model,summary_tokenizer)

# Show the original passage and its summary, each wrapped to 150 columns.
print ("\noriginal Text >>")
for wrp in wrap(text, 150):
    print (wrp)
print ("\n")
print ("Summarized Text >>")
for wrp in wrap(summarized_text, 150):
    print (wrp)
print ("\n")

# Maximum number of keywords kept by the demo get_keywords() call below.
# NOTE: this name is rebound to a Gradio slider further down the file.
total = 10
|
|
|
"""# **Answer Span Extraction (Keywords and Noun Phrases)**""" |
|
|
|
import nltk |
|
nltk.download('stopwords') |
|
from nltk.corpus import stopwords |
|
import string |
|
import pke |
|
import traceback |
|
|
|
|
|
def get_nouns_multipartite(content):
    """Extract up to 15 noun / proper-noun keyphrases from *content*.

    Uses ``pke``'s ``MultipartiteRank`` extractor restricted to the ``PROPN``
    and ``NOUN`` part-of-speech tags.

    Args:
        content: Text to extract keyphrases from.

    Returns:
        A list of keyphrase strings in the order ``get_n_best`` returns them.
        If extraction fails for any reason the traceback is printed and an
        empty list is returned (best-effort behaviour).
    """
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content, language='en')

        # Only nouns and proper nouns are considered as candidates.
        # (A punctuation/stop-word list used to be built here but was never
        # handed to the extractor, so it has been dropped as dead code.)
        extractor.candidate_selection(pos={'PROPN', 'NOUN'})

        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)

        # Each entry is indexable; element 0 is the phrase text.
        return [entry[0] for entry in keyphrases]
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate.
        traceback.print_exc()
        return []
|
|
|
from flashtext import KeywordProcessor |
|
|
|
def get_keywords(originaltext,summarytext,total):
    """Return keyphrases of the original text that also occur in its summary.

    Keyphrases are extracted from *originaltext* with
    ``get_nouns_multipartite`` and then searched for in *summarytext* with a
    ``KeywordProcessor``. The extraction order is preserved.

    Args:
        originaltext: Full source text.
        summarytext: Summary of the source text.
        total: Maximum number of keyphrases to return. Accepts any value
            ``int()`` can convert, e.g. ``3`` or ``3.0``.

    Returns:
        At most ``int(total)`` keyphrases, as a list of strings.
    """
    keywords = get_nouns_multipartite(originaltext)
    print ("keywords unsummarized: ",keywords)

    keyword_processor = KeywordProcessor()
    for keyword in keywords:
        keyword_processor.add_keyword(keyword)

    # A set de-duplicates the matches and gives O(1) membership tests below.
    found = set(keyword_processor.extract_keywords(summarytext))
    print ("keywords_found in summarized: ",list(found))

    important_keywords = [keyword for keyword in keywords if keyword in found]

    # `total` may arrive as a float (e.g. from a UI slider); a float slice
    # bound would raise TypeError, so coerce it first.
    return important_keywords[:int(total)]
|
|
|
|
|
# Demo run: keyphrases of the sample passage that also appear in its summary,
# capped at `total`.
imp_keywords = get_keywords(text,summarized_text,total)
print (imp_keywords)

"""# **Question generation using T5**"""

# Load the 'ramsrigouthamg/t5_squad_v1' checkpoint and tokenizer used by
# get_question(), and move the model to the same device as the summariser.
question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = question_model.to(device)
|
|
|
def get_question(context,answer,model,tokenizer):
    """Generate one question about *context* whose answer is *answer*.

    The prompt ``"context: <context> answer: <answer>"`` is tokenised
    (truncated to 384 tokens) and decoded with beam search (5 beams, at most
    72 tokens). Any ``"question:"`` marker in the decoded text is removed.

    Args:
        context: Passage the question should be about.
        answer: Answer span the question should lead to.
        model: Model exposing ``generate`` and ``device`` (e.g. a
            ``T5ForConditionalGeneration`` instance).
        tokenizer: Tokenizer matching *model*.

    Returns:
        The generated question, stripped of surrounding whitespace.
    """
    prompt = "context: {} answer: {}".format(context,answer)

    # `padding=False` replaces the deprecated `pad_to_max_length=False`.
    # Tensors are moved to the device the model actually lives on, so the
    # function does not depend on a module-level `device` variable.
    encoding = tokenizer(
        prompt,
        max_length=384,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    outs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        max_length=72,
    )

    # Exactly one sequence is requested, so only the first one is decoded.
    decoded = tokenizer.decode(outs[0], skip_special_tokens=True)
    return decoded.replace("question:","").strip()
|
|
|
|
|
|
|
# Print the summary again (wrapped to 150 columns) ahead of the questions.
for wrp in wrap(summarized_text, 150):
    print (wrp)
print ("\n")

# For every important keyword, print a generated question followed by the
# keyword itself (capitalised) as its answer.
for answer in imp_keywords:
    ques = get_question(summarized_text,answer,question_model,question_tokenizer)
    print (ques)
    print (answer.capitalize())
    print ("\n")
|
|
|
"""# **UI by using Gradio**""" |
|
|
|
import mysql.connector |
|
import datetime; |
|
|
|
import os

# The database password must never live in source control: it is read from
# the QG_DB_PASSWORD environment variable instead. The password that was
# previously committed here should be considered leaked and rotated.
# Host, user and schema can be overridden through the environment as well and
# otherwise keep their previous values.
_db_password = os.environ.get("QG_DB_PASSWORD")
if _db_password is None:
    raise RuntimeError(
        "Environment variable QG_DB_PASSWORD is not set; "
        "it is required to connect to the MySQL database."
    )

mydb = mysql.connector.connect(
    host=os.environ.get(
        "QG_DB_HOST", "qtechdb-1.cexugk1h8rui.ap-northeast-1.rds.amazonaws.com"
    ),
    user=os.environ.get("QG_DB_USER", "admin"),
    password=_db_password,
    database=os.environ.get("QG_DB_NAME", "spring_social"),
)
|
|
|
import gradio as gr |
|
|
|
|
|
# Input and output widgets of the "Text" tab.
context = gr.Textbox(lines=10, placeholder="Enter paragraph/content here...", label="Text")
total = gr.Slider(1,10, value=1,step=1, label="Total Number Of Questions")
# Labelled "Subject": it previously reused the label "Text", which made it
# indistinguishable from the content box above.
subject = gr.Textbox(placeholder="Enter subject/title here...", label="Subject")

output = gr.Markdown( label="Question and Answers")
|
|
|
|
|
def generate_question_text(context,subject,total):
    """Generate questions for pasted text and log the request to MySQL.

    The question sheet is produced by ``generate_question`` (the same helper
    the file-upload handler uses) instead of a duplicated copy of its body.
    The subject, input text, generated sheet and current timestamp are then
    inserted into the ``shorttexts`` table through the module-level ``mydb``
    connection.

    Args:
        context: Text to generate questions from.
        subject: Subject/title stored alongside the text.
        total: Maximum number of questions to generate.

    Returns:
        The generated question-and-answer sheet as an HTML string.
    """
    output = generate_question(context, subject, total)

    mycursor = mydb.cursor()
    try:
        timedate = datetime.datetime.now()
        sql = "INSERT INTO shorttexts (subject, input, output, timedate) VALUES (%s,%s, %s,%s)"
        val = (subject, context, output, timedate)
        mycursor.execute(sql, val)
        mydb.commit()
        print(mycursor.rowcount, "record inserted.")
    finally:
        # Release the cursor even when the insert or commit fails.
        mycursor.close()

    return output
|
|
|
# "Text" tab: pasted text, subject and question count go in, the generated
# sheet comes out. Flagging is manual with a single "Save Data" option.
iface = gr.Interface(
    fn=generate_question_text,
    inputs=[context,subject, total],
    outputs=output, css=".gradio-container {background-image: url('file=blue.jpg')}",
    allow_flagging="manual",flagging_options=["Save Data"])
|
|
|
|
|
|
|
def generate_question(context,subject,total):
    """Build an HTML question sheet with an answer key for *context*.

    The text is summarised, keyphrases shared by the text and its summary are
    selected, and one question is generated per keyphrase. The summary and the
    keyphrases are printed along the way.

    Args:
        context: Text to generate questions from.
        subject: Accepted for interface compatibility; not used here.
        total: Maximum number of questions to generate.

    Returns:
        An HTML string: a heading, the numbered questions, then the numbered
        answer key.
    """
    summary_text = summarizer(context,summary_model,summary_tokenizer)
    for line in wrap(summary_text, 150):
        print (line)

    phrases = get_keywords(context,summary_text,total)
    print ("\n\nNoun phrases",phrases)

    pieces = ["<b style='color:black;'>Answer the following short questions.</b><br><br>"]

    # Numbered questions, each followed by an empty line.
    for number, answer in enumerate(phrases, start=1):
        ques = get_question(summary_text,answer,question_model,question_tokenizer)
        pieces.append("<b style='color:black;'>Q{}) {}</b><br><br>".format(number, ques))

    pieces.append("<br><b style='color:black;'>Correct Answer Key:</b><br>")

    # Answer key, numbered the same way as the questions.
    pieces.extend(
        "<b style='color:green;'>Ans{}: {}</b><br>".format(number, answer.capitalize())
        for number, answer in enumerate(phrases, start=1)
    )

    return "".join(pieces)
|
|
|
import glob |
|
import os.path |
|
import pandas as pd |
|
|
|
file =None |
|
|
|
def filecreate(x,subject,total):
    """Generate questions for an uploaded text file and log the request.

    The file is read as UTF-8 text, passed to ``generate_question``, and the
    subject, file text, generated sheet and current timestamp are inserted
    into the ``shortfiles`` table through the module-level ``mydb`` connection.

    Args:
        x: Uploaded file object; its ``name`` attribute is the path to read.
        subject: Subject/title stored alongside the text.
        total: Maximum number of questions to generate.

    Returns:
        The generated question-and-answer sheet as an HTML string.

    Raises:
        ValueError: If no file was uploaded (``x`` is ``None``).
    """
    if x is None:
        # Fail with a clear message instead of an AttributeError on `x.name`.
        raise ValueError("No file was uploaded.")

    # Explicit encoding so the result does not depend on the host locale.
    with open(x.name, encoding="utf-8") as fo:
        text = fo.read()

    generated = generate_question(text,subject, total)

    mycursor = mydb.cursor()
    try:
        timedate = datetime.datetime.now()
        sql = "INSERT INTO shortfiles (subject, input, output, timedate) VALUES (%s,%s, %s,%s)"
        val = (subject, text, generated, timedate)
        mycursor.execute(sql, val)
        mydb.commit()
        print(mycursor.rowcount, "record inserted.")
    finally:
        # Release the cursor even when the insert or commit fails.
        mycursor.close()

    return generated
|
|
|
|
|
|
|
import gradio as gr |
|
|
|
# Widgets of the "Upload File" tab. These rebind context/subject/total; the
# "Text" tab is unaffected because `iface` was already built with the earlier
# component objects.
context = gr.HTML(label="Text")
file = gr.File()
# Labelled "Subject": it previously reused the label "Text".
subject = gr.Textbox(placeholder="Enter subject/title here...", label="Subject")
total = gr.Slider(1,10, value=1,step=1, label="Total Number Of Questions")
|
|
|
|
|
|
|
|
|
# "Upload File" tab: uploaded file, subject and question count go in, the
# generated sheet is shown in the HTML component. Flagging is manual with a
# single "Save Data" option.
fface = gr.Interface(
    fn=filecreate,
    inputs=[file,subject,total],
    outputs=context,
    css=".gradio-container {background-image: url('file=blue.jpg')}",
    allow_flagging="manual",flagging_options=["Save Data"])
|
|
|
|
|
|
|
|
|
# Combine both interfaces into one app with "Text" and "Upload File" tabs.
demo = gr.TabbedInterface([iface, fface], ["Text", "Upload File"], css=".gradio-container {background-image: url('file=blue.jpg')}")
# NOTE(review): share=True asks Gradio for a publicly reachable link and
# debug=True keeps the call in the foreground — confirm both are intended
# outside a notebook session.
demo.launch(debug=True, share=True)