Spaces:

HemanthSai7
/

IntelligentQuestionGenerator

Sleeping

G.Hemanth Sai

qgen

32d9382 almost 3 years ago

7.66 kB

	import streamlit as st
	import pandas as pd
	from keybert import KeyBERT

	import seaborn as sns

	from src.Pipeline.TextSummarization import T5_Base
	from src.Pipeline.QuestGen import sense2vec_get_words,get_question


	st.title("❓ Intelligent Question Generator")
	st.header("")


	with st.expander("ℹ️ - About this app", expanded=True):

	st.write(
	"""
	- The Intelligent Question Generator app is an easy-to-use interface built in Streamlit which uses [KeyBERT](https://github.com/MaartenGr/KeyBERT), [Sense2vec](https://github.com/explosion/sense2vec), [T5](https://huggingface.co/ramsrigouthamg/t5_paraphraser)
	- It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
	- [sense2vec](https://github.com/explosion/sense2vec) (Trask et. al, 2015) is a nice twist on word2vec that lets you learn more interesting and detailed word vectors.
	"""
	)

	st.markdown("")

	st.markdown("")
	st.markdown("## 📌 Paste document ")

	with st.form(key="my_form"):
	ce, c1, ce, c2, c3 = st.columns([0.07, 2, 0.07, 5, 1])
	with c1:
	ModelType = st.radio(
	"Choose your model",
	["DistilBERT (Default)", "BERT", "RoBERTa", "ALBERT", "XLNet"],
	help="At present, you can choose 1 model ie DistilBERT to embed your text. More to come!",
	)

	if ModelType == "Default (DistilBERT)":
	# kw_model = KeyBERT(model=roberta)

	@st.cache(allow_output_mutation=True)
	def load_model(model):
	return KeyBERT(model=model)

	kw_model = load_model('roberta')

	else:
	@st.cache(allow_output_mutation=True)
	def load_model(model):
	return KeyBERT(model=model)

	kw_model = load_model("distilbert-base-nli-mean-tokens")

	top_N = st.slider(
	"# of results",
	min_value=1,
	max_value=30,
	value=10,
	help="You can choose the number of keywords/keyphrases to display. Between 1 and 30, default number is 10.",
	)
	min_Ngrams = st.number_input(
	"Minimum Ngram",
	min_value=1,
	max_value=4,
	help="""The minimum value for the ngram range.
	Keyphrase_ngram_range sets the length of the resulting keywords/keyphrases.To extract keyphrases, simply set keyphrase_ngram_range to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
	# help="Minimum value for the keyphrase_ngram_range. keyphrase_ngram_range sets the length of the resulting keywords/keyphrases. To extract keyphrases, simply set keyphrase_ngram_range to (1, # 2) or higher depending on the number of words you would like in the resulting keyphrases.",
	)

	max_Ngrams = st.number_input(
	"Maximum Ngram",
	value=1,
	min_value=1,
	max_value=4,
	help="""The maximum value for the keyphrase_ngram_range.
	Keyphrase_ngram_range sets the length of the resulting keywords/keyphrases.
	To extract keyphrases, simply set keyphrase_ngram_range to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
	)

	StopWordsCheckbox = st.checkbox(
	"Remove stop words",
	value=True,
	help="Tick this box to remove stop words from the document (currently English only)",
	)

	use_MMR = st.checkbox(
	"Use MMR",
	value=True,
	help="You can use Maximal Margin Relevance (MMR) to diversify the results. It creates keywords/keyphrases based on cosine similarity. Try high/low 'Diversity' settings below for interesting variations.",
	)

	Diversity = st.slider(
	"Keyword diversity (MMR only)",
	value=0.5,
	min_value=0.0,
	max_value=1.0,
	step=0.1,
	help="""The higher the setting, the more diverse the keywords.Note that the Keyword diversity slider only works if the MMR checkbox is ticked.""",
	)

	with c2:
	doc = st.text_area(
	"Paste your text below (max 500 words)",
	height=510,
	)

	MAX_WORDS = 500
	import re
	res = len(re.findall(r"\w+", doc))
	if res > MAX_WORDS:
	st.warning(
	"⚠️ Your text contains "
	+ str(res)
	+ " words."
	+ " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
	)

	doc = doc[:MAX_WORDS]
	# base=base=T5_Base("t5-base","cpu",2048)
	# doc=base.summarize(doc)

	submit_button = st.form_submit_button(label="✨ Get me the data!")

	if use_MMR:
	mmr = True
	else:
	mmr = False

	if StopWordsCheckbox:
	StopWords = "english"
	else:
	StopWords = None

	if min_Ngrams > max_Ngrams:
	st.warning("min_Ngrams can't be greater than max_Ngrams")
	st.stop()

	# Uses KeyBERT to extract the top keywords from a text
	# Arguments: text (str)
	# Returns: list of keywords (list)
	keywords = kw_model.extract_keywords(
	doc,
	keyphrase_ngram_range=(min_Ngrams, max_Ngrams),
	use_mmr=mmr,
	stop_words=StopWords,
	top_n=top_N,
	diversity=Diversity,
	)
	# print(keywords)

	st.markdown("## 🎈 Results ")

	st.header("")


	df = (
	pd.DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
	.sort_values(by="Relevancy", ascending=False)
	.reset_index(drop=True)
	)

	df.index += 1

	# Add styling
	cmGreen = sns.light_palette("green", as_cmap=True)
	cmRed = sns.light_palette("red", as_cmap=True)
	df = df.style.background_gradient(
	cmap=cmGreen,
	subset=[
	"Relevancy",
	],
	)

	c1, c2, c3 = st.columns([1, 3, 1])

	format_dictionary = {
	"Relevancy": "{:.2%}",
	}

	df = df.format(format_dictionary)

	with c2:
	st.table(df)

	with st.expander("Note about Quantitative Relevancy"):
	st.markdown(
	"""
	- The relevancy score is a quantitative measure of how relevant the keyword/keyphrase is to the document. It is calculated using cosine similarity. The higher the score, the more relevant the keyword/keyphrase is to the document.
	- So if you see a keyword/keyphrase with a high relevancy score, it means that it is a good keyword/keyphrase to use in question answering, generation ,summarization, and other NLP tasks.
	"""
	)

	with st.form(key="ques_form"):
	ice, ic1, ice, ic2 ,ic3= st.columns([0.07, 2, 0.07, 5,0.07])
	with ic1:
	TopN = st.slider(
	"Top N sense2vec results",
	value=20,
	min_value=0,
	max_value=50,
	step=1,
	help="""Get the n most similar terms.""",
	)

	with ic2:
	input_keyword = st.text_input("Paste any keyword generated above")
	keywrd_button = st.form_submit_button(label="✨ Get me the questions!")

	if keywrd_button:
	st.markdown("## 🎈 Questions ")
	ext_keywrds=sense2vec_get_words(TopN,input_keyword)
	if len(ext_keywrds)<1:
	st.warning("Sorry questions couldn't be generated")

	for answer in ext_keywrds:
	sentence_for_T5=" ".join(doc.split())
	ques=get_question(sentence_for_T5,answer)
	ques=ques.replace("<pad>","").replace("</s>","").replace("<s>","")
	st.markdown(f'> #### {ques} ')