import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import pandas as pd
from fpdf import FPDF
import wikipediaapi
from functools import lru_cache
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
import random
from sense2vec import Sense2Vec
import sense2vec
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download the NLTK resources used for sentence tokenization and WordNet lookups
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('brown')
nltk.download('wordnet')

print("***************************************************************")

st.set_page_config(
    page_title="Question Generator",
    initial_sidebar_state="collapsed",
)

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# s2v = Sense2Vec.from_disk(self=Sense2Vec, path='s2v_old')
s2v = sense2vec.Sense2Vec().from_disk('s2v_old')

# Initialize Wikipedia API with a user agent
user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')


@st.cache_resource
def load_model():
    model_name = "DevBM/t5-large-squad"
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return model, tokenizer


# Function to extract keywords using combined techniques
def extract_keywords(text, extract_all):
    doc = nlp(text)
    spacy_keywords = set([ent.text for ent in doc.ents])
    spacy_entities = spacy_keywords
    print(f"\n\nSpacy Entities: {spacy_entities} \n\n")

    # Use only spaCy entities
    if not extract_all:
        return list(spacy_entities)

    # Use RAKE
    rake = Rake()
    rake.extract_keywords_from_text(text)
    rake_keywords = set(rake.get_ranked_phrases())
    print(f"\n\nRake Keywords: {rake_keywords} \n\n")

    # Use spaCy for NER and POS tagging
    spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
    print(f"\n\nSpacy Keywords: {spacy_keywords} \n\n")

    # Use TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    tfidf_keywords = set(vectorizer.get_feature_names_out())
    print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")

    # Combine all keywords
    combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
    return list(combined_keywords)


def get_similar_words_sense2vec(word, n=3):
    # Try to find the word with its most likely part-of-speech
    word_with_pos = word + "|NOUN"
    if word_with_pos in s2v:
        similar_words = s2v.most_similar(word_with_pos, n=n)
        return [w.split("|")[0] for w, _ in similar_words]
    # If not found, try without POS
    if word in s2v:
        similar_words = s2v.most_similar(word, n=n)
        return [w.split("|")[0] for w, _ in similar_words]
    return []


def get_synonyms(word, n=3):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word and lemma.name() not in synonyms:
                synonyms.append(lemma.name())
                if len(synonyms) == n:
                    return synonyms
    return synonyms


def generate_options(answer, context, n=3):
    options = [answer]

    # Try to get similar words based on sense2vec
    similar_words = get_similar_words_sense2vec(answer, n)
    options.extend(similar_words)

    # If we don't have enough options, try synonyms
    if len(options) < n + 1:
        synonyms = get_synonyms(answer, n - len(options) + 1)
        options.extend(synonyms)

    # If we still don't have enough options, extract other entities from the context
    if len(options) < n + 1:
        doc = nlp(context)
        entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
        options.extend(entities[:n - len(options) + 1])

    # If we still need more options, add some random words from the context
    if len(options) < n + 1:
        context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
        options.extend(random.sample(context_words, min(n - len(options) + 1, len(context_words))))

    # Ensure we have the correct number of unique options
    options = list(dict.fromkeys(options))[:n + 1]

    # Shuffle the options
    random.shuffle(options)
    return options


# Function to map keywords to sentences with a customizable context window size
def map_keywords_to_sentences(text, keywords, context_window_size):
    sentences = sent_tokenize(text)
    keyword_sentence_mapping = {}
    for keyword in keywords:
        for i, sentence in enumerate(sentences):
            if keyword in sentence:
                # Combine the current sentence with surrounding sentences for context
                start = max(0, i - context_window_size)
                end = min(len(sentences), i + context_window_size + 1)
                context = ' '.join(sentences[start:end])
                if keyword not in keyword_sentence_mapping:
                    keyword_sentence_mapping[keyword] = context
                else:
                    keyword_sentence_mapping[keyword] += ' ' + context
    return keyword_sentence_mapping


# Function to perform entity linking using the Wikipedia API
@lru_cache(maxsize=128)
def entity_linking(keyword):
    page = wiki_wiki.page(keyword)
    if page.exists():
        return page.fullurl
    return None


# Function to generate questions using beam search
def generate_question(context, answer, num_beams):
    input_text = f" {context} {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question


# Function to export questions to CSV
def export_to_csv(data):
    df = pd.DataFrame(data, columns=["Context", "Answer", "Question", "Options"])
    csv = df.to_csv(index=False, encoding='utf-8')
    return csv


# Function to export questions to PDF
def export_to_pdf(data):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for context, answer, question, options in data:
        pdf.multi_cell(0, 10, f"Context: {context}")
        pdf.multi_cell(0, 10, f"Answer: {answer}")
        pdf.multi_cell(0, 10, f"Question: {question}")
        pdf.ln(10)
    # pdf.output("questions.pdf")
    return pdf.output(name='questions.pdf', dest='S').encode('latin1')


def display_word_cloud(generated_questions):
    # Build a simple word-frequency table from the generated questions
    word_frequency = {}
    for question in generated_questions:
        words = question.split()
        for word in words:
            word_frequency[word] = word_frequency.get(word, 0) + 1

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequency)
    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig)


if 'data' not in st.session_state:
    st.session_state.data = None

# Streamlit interface
st.title(":blue[Question Generator from Text]")

text = st.text_area(
    "Enter text here:",
    value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.",
)

with st.sidebar:
    st.subheader("Customization Options")
    # Customization options
    num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
    context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
    num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
    with st.expander("Choose the Additional Elements to show"):
Additional Elements to show"): show_context = st.checkbox("Context",True) show_answer = st.checkbox("Answer",True) show_options = st.checkbox("Options",False) show_entity_link = st.checkbox("Enitity Link For Wikipedia",True) extract_all_keywords = st.toggle("Extract max Keywords",value=False) if st.button("Generate Questions"): if text: model, tokenizer = load_model() keywords = extract_keywords(text,extract_all_keywords) print(f"\n\nFinal Keywords in Main Function: {keywords}\n\n") keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size) st.subheader("Generated Questions:",divider='blue') data = [] for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()): if i >= num_questions: break linked_entity = entity_linking(keyword) question = generate_question(context, keyword, num_beams=num_beams) options = generate_options(keyword, context) st.subheader(body=f":orange[Q{i+1}:] {question}") if show_context is True: st.write(f"**Context:** {context}") if show_answer is True: st.write(f"**Answer:** {keyword}") if show_options is True: st.write(f"**Options:**") for j, option in enumerate(options): st.write(f"{chr(65+j)}. {option}") if show_entity_link is True: if linked_entity: st.write(f"**Entity Link:** {linked_entity}") st.write("---") data.append((context, keyword, question, options)) # Add the data to session state st.session_state.data = data # display_word_cloud() print(data) # Export buttons if st.session_state.data is not None: with st.sidebar: st.subheader('Download Content') csv_data = export_to_csv(data) st.download_button(label="CSV Format", data=csv_data, file_name='questions.csv', mime='text/csv') pdf_data = export_to_pdf(data) st.download_button(label="PDF Format", data=pdf_data, file_name='questions.pdf', mime='application/pdf') if st.session_state.data is not None: st.markdown("You can download the data from the sidebar.") else: st.write("Please enter some text to generate questions.") print("********************************************************************************")