# MCQ-Generator / app.py
import streamlit as st
from transformers import AutoTokenizer
from fastT5 import OnnxT5, get_onnx_runtime_sessions
from keywords import tokenize_sentence, get_multipartiterank_topics, get_topicrank_topics, get_yake_topics
from annotated_text import annotated_text

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize

import string
import subprocess
import logging
import multiprocessing
total_threads = multiprocessing.cpu_count()

import onnxruntime as ort
# from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
try:
    import pke
    logging.info("pke is already installed")
except ImportError:
    logging.info("pke not found -- installing it from GitHub")
    subprocess.run(['pip3', 'install', 'git+https://github.com/boudinfl/pke.git'])
    subprocess.run(['python3', '-m', 'spacy', 'download', 'en'])
    import pke
# Restrict ONNX Runtime to a single thread per session
session_options_ort = ort.SessionOptions()
session_options_ort.intra_op_num_threads = 1
session_options_ort.inter_op_num_threads = 1
st.set_page_config(
    layout="wide",                 # "centered" or "wide"
    initial_sidebar_state="auto",  # "auto", "expanded", or "collapsed"
    page_title=None,               # string or None; strings get appended with "• Streamlit"
)
def set_page_title(title):
    st.sidebar.markdown(unsafe_allow_html=True, body=f"""
        <iframe height=0 srcdoc="<script>
            const title = window.parent.document.querySelector('title') \
            const oldObserver = window.parent.titleObserver
            if (oldObserver) {{
                oldObserver.disconnect()
            }} \
            const newObserver = new MutationObserver(function(mutations) {{
                const target = mutations[0].target
                if (target.text !== '{title}') {{
                    target.text = '{title}'
                }}
            }}) \
            newObserver.observe(title, {{ childList: true }})
            window.parent.titleObserver = newObserver \
            title.text = '{title}'
        </script>" />
    """)
set_page_title('MCQ Generator')
import yaml

def read_yaml(file_path):
    with open(file_path, "r") as f:
        return yaml.safe_load(f)

config = read_yaml('config.yaml')
t5_chkpt = config['t5_normal']['chkpt']
t5_model_path = config['t5_normal']['model_path']
t5_tokenizer = config['t5_normal']['tokenizer']
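# The lookups above assume a config.yaml shaped roughly like the following
# (values are illustrative placeholders, not the repo's actual settings; note
# that `chkpt` must be a Hugging Face repo id containing a "/", since the code
# below splits on it to build the ONNX file names):
#
#   t5_normal:
#     chkpt: some-user/t5-question-generation
#     model_path: t5_onnx_models
#     tokenizer: some-user/t5-question-generation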
# Paths to the quantized ONNX exports (encoder, decoder, init-decoder) of the T5 checkpoint
model_path_quantized = (f'{t5_model_path}/{t5_chkpt.split("/")[1]}-encoder-quantized.onnx',
                        f'{t5_model_path}/{t5_chkpt.split("/")[1]}-decoder-quantized.onnx',
                        f'{t5_model_path}/{t5_chkpt.split("/")[1]}-init-decoder-quantized.onnx')

model_session = get_onnx_runtime_sessions(model_paths=model_path_quantized, n_threads=1, parallel_exe_mode=False)
model_t5 = OnnxT5(model_or_model_path=t5_chkpt, onnx_model_sessions=model_session)
tokenizer_t5 = AutoTokenizer.from_pretrained(t5_tokenizer)
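# Note: the quantized .onnx files referenced above are expected to have been
# exported offline with fastT5, roughly as in this sketch (output location and
# quantization options may need adjusting; this is an assumption, not the
# repo's actual export script):
#
#   from fastT5 import export_and_get_onnx_model
#   export_and_get_onnx_model(t5_chkpt)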
def create_question_t5(model, tokenizer, context, answer, max_length=64):
    # Generate a question whose answer is `answer`, conditioned on `context`
    input_text = "context: %s answer: %s </s>" % (context, answer)
    features = tokenizer([input_text], return_tensors='pt')
    output = model.generate(input_ids=features['input_ids'],
                            attention_mask=features['attention_mask'],
                            max_length=max_length,
                            num_beams=3)
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
def create_answers_t5(model, tokenizer, context, question, max_length=128):
    # Generate an answer to `question`, conditioned on `context`
    input_text = "context: %s question: %s </s>" % (context, question)
    features = tokenizer([input_text], return_tensors='pt')
    output = model.generate(input_ids=features['input_ids'],
                            attention_mask=features['attention_mask'],
                            max_length=max_length,
                            num_beams=3)
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
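# Illustrative (commented-out) use of the two helpers above; actual output
# depends on the checkpoint configured in config.yaml, and `paragraph` stands
# in for any input text:
#
#   q = create_question_t5(model_t5, tokenizer_t5, paragraph, answer="companies")
#   a = create_answers_t5(model_t5, tokenizer_t5, paragraph, question=q)
#
# The question-generation checkpoint is expected to return text of the form
# "question: ...", which is why the UI code below splits on 'question:'.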
default_context = """Another important distinction is between companies that build enterprise products (B2B - business to business) and companies that build customer products (B2C - business to consumer).
B2B companies build products for organizations. Examples of enterprise products are Customer relationship management (CRM) software, project management tools, database management systems, cloud hosting services, etc.
B2C companies build products for individuals. Examples of consumer products are social networks, search engines, ride-sharing services, health trackers, etc.
Many companies do both -- their products can be used by individuals but they also offer plans for enterprise users. For example, Google Drive can be used by anyone but they also have Google Drive for Enterprise.
Even if a B2C company doesn’t create products for enterprises directly, they might still need to sell to enterprises. For example, Facebook’s main product is used by individuals but they sell ads to enterprises. Some might argue that this makes Facebook users products, as famously quipped: “If you’re not paying for it, you’re not the customer; you’re the product being sold.”"""
default_answer = "companies"
input_context = st.text_area(label='Input paragraph', height=300, max_chars=1000, value=default_context)
c1, c2, c3 = st.columns(3)

with c1:
    create_usingkeyword = st.button("Create Questions using Keywords")
    if create_usingkeyword:
        tokenized_sent = tokenize_sentence(input_context)  # sentence-tokenized context (not used further in this branch)
        keywords_noun_adj_verb = get_multipartiterank_topics(input_context)
        t5_questions = []
        with st.spinner("Creating Questions"):
            for k in keywords_noun_adj_verb:
                question = create_question_t5(model=model_t5,
                                              tokenizer=tokenizer_t5,
                                              context=input_context,
                                              answer=k)
                t5_questions.append(question.split('question:')[1])

        for i, (quest, ans) in enumerate(zip(t5_questions, keywords_noun_adj_verb)):
            st.write(f"{i + 1}: {quest}")
            annotated_text("Answer is ", (ans, '', "#fea"))
            st.markdown("---")
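# For context: get_multipartiterank_topics comes from keywords.py (not shown
# here) and presumably wraps pke's MultipartiteRank along these lines
# (a sketch with assumed parameter values, not the repo's actual settings):
#
#   extractor = pke.unsupervised.MultipartiteRank()
#   extractor.load_document(input=input_context, language='en')
#   extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
#   extractor.candidate_weighting(alpha=1.1, threshold=0.74, method='average')
#   keywords = [phrase for phrase, _ in extractor.get_n_best(n=10)]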
with c2:
    create_usinglongformer = st.button("Create Questions using Longformer")
    if create_usinglongformer:
        pass  # Longformer-based generation not implemented yet