import streamlit as st from transformers import T5Tokenizer, T5ForConditionalGeneration from sense2vec import Sense2Vec import spacy import pandas as pd # Initialize T5 model and tokenizer for question generation tokenizer = T5Tokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap") model = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap") # Load SpaCy and Sense2Vec nlp = spacy.load("en_core_web_sm") s2v = Sense2Vec().from_disk('s2v_reddit_2015_md/s2v_old') # Function to get important keywords def get_imp_keywords(content): return [token.text for token in nlp(content) if token.pos_ in [ "PROPN", "NUM"]] # Function to generate a question def get_question(context, answer): prompt = f"answer: {answer} context: {context} " encoding = tokenizer.encode_plus(prompt, max_length=256, truncation=True, return_tensors="pt") outs = model.generate(input_ids=encoding["input_ids"], attention_mask=encoding["attention_mask"], early_stopping=True, num_beams=5, num_return_sequences=1, max_length=300) return tokenizer.decode(outs[0], skip_special_tokens=True).replace("question:", "").strip() # Function to get similar words using Sense2Vec def sense2vec_get_words(word): word = word.lower().replace(" ", "_") # Prepare word for Sense2Vec sense = s2v.get_best_sense(word) # Get best sense (e.g., 'apple|NOUN') if not sense: return [] # Return empty list if no sense found # Fetch most similar words, filter by similarity score, exclude the input word, and format them return [ each_word[0].split("|")[0].replace("_", " ").title() for each_word in s2v.most_similar(sense, n=20) if each_word[0].split("|")[0] != word ] # Generate multiple questions and return as DataFrame def generate_questions(context, num_questions=5): questions_data = [] for idx, answer in enumerate(get_imp_keywords(context)): distractors = sense2vec_get_words(answer.capitalize()) if distractors: question = get_question(context, answer) options = [answer.capitalize()] + distractors[-3:] question_dict = { "Question": question, "Answer": answer.capitalize(), "Option 1": options[0], "Option 2": options[1] if len(options) > 1 else "", "Option 3": options[2] if len(options) > 2 else "", "Option 4": options[3] if len(options) > 3 else "" } questions_data.append(question_dict) if idx >= num_questions - 1: break return pd.DataFrame(questions_data) # Streamlit Interface st.title("Text-based Question Generator") st.sidebar.title("Instructions") st.sidebar.write(""" 1. Enter text content in the text area below. 2. Set the number of questions you want to generate. 3. Click "Generate Questions" to view the questions. 4. Use the download button to save questions as a CSV file. """) # Text input for user to enter content text_input = st.text_area("Enter text content...", height=200) # Adjustable number of questions num_questions = st.slider("Number of Questions", min_value=1, max_value=10, value=5) if st.button("Generate Questions") and text_input: with st.spinner("Generating questions..."): questions_df = generate_questions(text_input, num_questions) st.write("Generated Questions") # Display each question in an accordion-style format for i, row in questions_df.iterrows(): with st.expander(f"Question {i + 1}"): st.write(f"**Question:** {row['Question']}") st.write(f"**Answer:** {row['Answer']}") st.write("**Options:**") for j in range(1, 5): option = row[f"Option {j}"] if option: st.write(f"{j}. {option}") # Add a CSV download button csv = questions_df.to_csv(index=False) st.download_button( label="Download Questions as CSV", data=csv, file_name="generated_questions.csv", mime="text/csv" )