import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import pandas as pd
from fpdf import FPDF
import wikipediaapi
# from b import b

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('brown')
from nltk.tokenize import sent_tokenize

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize Wikipedia API with a user agent
user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')

# Load T5 model and tokenizer
model_name = "DevBM/t5-large-squad"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Function to extract keywords using combined techniques
def extract_keywords(text):
    # Use RAKE
    rake = Rake()
    rake.extract_keywords_from_text(text)
    rake_keywords = set(rake.get_ranked_phrases())

    # Use spaCy for NER and POS tagging
    doc = nlp(text)
    spacy_keywords = set([ent.text for ent in doc.ents])
    spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])

    # Use TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    tfidf_keywords = set(vectorizer.get_feature_names_out())

    # Combine all keywords
    combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
    return list(combined_keywords)

# Function to map keywords to sentences with customizable context window size
def map_keywords_to_sentences(text, keywords, context_window_size):
    sentences = sent_tokenize(text)
    keyword_sentence_mapping = {}
    for keyword in keywords:
        for i, sentence in enumerate(sentences):
            if keyword in sentence:
                # Combine current sentence with surrounding sentences for context
                start = max(0, i - context_window_size)
                end = min(len(sentences), i + context_window_size + 1)
                context = ' '.join(sentences[start:end])
                if keyword not in keyword_sentence_mapping:
                    keyword_sentence_mapping[keyword] = context
                else:
                    keyword_sentence_mapping[keyword] += ' ' + context
    return keyword_sentence_mapping

# Function to perform entity linking using Wikipedia API
def entity_linking(keyword):
    page = wiki_wiki.page(keyword)
    if page.exists():
        return page.fullurl
    return None

# Function to generate questions using beam search
def generate_question(context, answer, num_beams=5):
    # Build the prompt with context/answer markers expected by the fine-tuned model
    input_text = f"<context> {context} <answer> {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

# Function to export questions to CSV
def export_to_csv(data):
    df = pd.DataFrame(data, columns=["Context", "Answer", "Question"])
    csv = df.to_csv(index=False, encoding='utf-8')
    return csv

# Function to export questions to PDF
def export_to_pdf(data):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for context, answer, question in data:
        pdf.multi_cell(0, 10, f"Context: {context}")
        pdf.multi_cell(0, 10, f"Answer: {answer}")
        pdf.multi_cell(0, 10, f"Question: {question}")
        pdf.ln(10)
    # pdf.output("questions.pdf")
    return pdf.output(name='questions.pdf', dest='S').encode('latin1')

# Streamlit interface
st.title(":blue[Question Generator from Text]")
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
# Customization options
num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
question_complexity = st.selectbox("Select question complexity", ["Simple", "Intermediate", "Complex"])
download_csv = st.toggle('Download CSV', value=True)
download_pdf = st.toggle('Download PDF', value=True)

if st.button("Generate Questions"):
    if text:
        keywords = extract_keywords(text)
        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
        st.subheader("Generated Questions:")
        data = []
        for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
            if i >= num_questions:
                break
            linked_entity = entity_linking(keyword)
            question = generate_question(context, keyword, num_beams=num_beams)
            st.write(f"**Context:** {context}")
            st.write(f"**Answer:** {keyword}")
            st.write(f"**Question:** {question}")
            if linked_entity:
                st.write(f"**Entity Link:** {linked_entity}")
            st.write("---")
            data.append((context, keyword, question))

        # Export buttons, gated by the download toggles above
        if data:
            if download_csv:
                csv_data = export_to_csv(data)
                st.download_button(label="Download CSV", data=csv_data, file_name='questions.csv', mime='text/csv')
            if download_pdf:
                pdf_data = export_to_pdf(data)
                st.download_button(label="Download PDF", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
    else:
        st.write("Please enter some text to generate questions.")
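
# Rough local setup sketch (assumes this script is saved as app.py; package set inferred
# from the imports above, so adjust versions/names to your environment):
#   pip install streamlit transformers sentencepiece spacy nltk scikit-learn rake-nltk pandas fpdf wikipedia-api
#   python -m spacy download en_core_web_sm
#   streamlit run app.py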