DevBM committed
Commit e84f648
Parent(s): 240f2d4

Reverting to Jul19 Commit

Files changed (1): app.py (+614, -28)
app.py CHANGED
@@ -1,10 +1,50 @@
+import streamlit as st
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+import spacy
 import nltk
+from sklearn.feature_extraction.text import TfidfVectorizer
+from rake_nltk import Rake
+import pandas as pd
+from fpdf import FPDF
+import wikipediaapi
+from functools import lru_cache
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('brown')
+from nltk.tokenize import sent_tokenize
 nltk.download('wordnet')
-import streamlit as st
+from nltk.corpus import wordnet
+import random
+import sense2vec
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+import json
+import os
+from sentence_transformers import SentenceTransformer, util
+import textstat
+from spellchecker import SpellChecker
+from transformers import pipeline
+import re
+import pymupdf
+import uuid
+import time
+import asyncio
+import aiohttp
+from datetime import datetime
+import base64
+from io import BytesIO
+# '-----------------'
+import smtplib
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from email.mime.base import MIMEBase
+from email.mime.application import MIMEApplication
+from email import encoders
+# '------------------'
+from gliner import GLiNER
+# -------------------
 
+print("***************************************************************")
 
 st.set_page_config(
     page_icon='cyclone',
@@ -15,19 +55,62 @@ st.set_page_config(
     }
 )
 
+st.set_option('deprecation.showPyplotGlobalUse',False)
 
-from text_processing import clean_text, get_pdf_text
-from question_generation import generate_questions_async
-from visualization import display_word_cloud
-from data_export import export_to_csv, export_to_pdf
-from feedback import collect_feedback, analyze_feedback, export_feedback_data
-from utils import get_session_id, initialize_state, get_state, set_state, display_info, QuestionGenerationError, entity_linking
-import asyncio
-import time
-import pandas as pd
-from data_export import send_email_with_attachment
+class QuestionGenerationError(Exception):
+    """Custom exception for question generation errors."""
+    pass
 
-st.set_option('deprecation.showPyplotGlobalUse',False)
+
+# Initialize Wikipedia API with a user agent
+user_agent = 'QGen/1.2'
+wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')
+
+def get_session_id():
+    if 'session_id' not in st.session_state:
+        st.session_state.session_id = str(uuid.uuid4())
+    return st.session_state.session_id
+
+def initialize_state(session_id):
+    if 'session_states' not in st.session_state:
+        st.session_state.session_states = {}
+    if session_id not in st.session_state.session_states:
+        st.session_state.session_states[session_id] = {
+            'generated_questions': [],
+            # add other state variables as needed
+        }
+    return st.session_state.session_states[session_id]
+
+def get_state(session_id):
+    return st.session_state.session_states[session_id]
+
+def set_state(session_id, key, value):
+    st.session_state.session_states[session_id][key] = value
+
+
+@st.cache_resource
+def load_model(modelname):
+    model = T5ForConditionalGeneration.from_pretrained(modelname)
+    tokenizer = T5Tokenizer.from_pretrained(modelname)
+    return model, tokenizer
+
+# Load spaCy model and sense2vec vectors
+@st.cache_resource
+def load_nlp_models():
+    nlp = spacy.load("en_core_web_md")
+    s2v = sense2vec.Sense2Vec().from_disk('s2v_old')
+    return nlp, s2v
+
+# Load quality-assessment models
+@st.cache_resource
+def load_qa_models():
+    # Sentence-transformer model for semantic similarity
+    similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
+    spell = SpellChecker()
+    return similarity_model, spell
 
 with st.sidebar:
     select_model = st.selectbox("Select Model", ("T5-large","T5-small"))
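The session helpers above give each browser session its own bucket inside st.session_state, while the three @st.cache_resource loaders are created once per process and shared across sessions. A minimal sketch of how the helpers compose (all names come from this diff):

    session_id = get_session_id()            # stable uuid4 for this browser session
    state = initialize_state(session_id)     # creates {'generated_questions': []} on first call
    set_state(session_id, 'generated_questions', [])
    assert get_state(session_id) == state    # same per-session dict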
 
@@ -35,8 +118,514 @@ if select_model == "T5-large":
     modelname = "DevBM/t5-large-squad"
 elif select_model == "T5-small":
     modelname = "AneriThakkar/flan-t5-small-finetuned"
+nlp, s2v = load_nlp_models()
+similarity_model, spell = load_qa_models()
+context_model = similarity_model
+model, tokenizer = load_model(modelname)
+
+
+# Info Section
+def display_info():
+    st.sidebar.title("Information")
+    st.sidebar.markdown("""
+    ### Question Generator System
+    This system is designed to generate questions based on the provided context. It uses various NLP techniques and models to:
+    - Extract keywords from the text
+    - Map keywords to sentences
+    - Generate questions
+    - Provide multiple choice options
+    - Assess the quality of generated questions
+    #### Key Features:
+    - **Keyword Extraction:** Combines RAKE, TF-IDF, and spaCy for comprehensive keyword extraction.
+    - **Question Generation:** Utilizes a pre-trained T5 model for generating questions.
+    - **Options Generation:** Creates contextually relevant multiple-choice options.
+    - **Question Assessment:** Scores questions based on relevance, complexity, and spelling correctness.
+    - **Feedback Collection:** Allows users to rate the generated questions and provides statistics on feedback.
+    #### Customization Options:
+    - Number of beams for question generation
+    - Context window size for mapping keywords to sentences
+    - Number of questions to generate
+    - Additional display elements (context, answer, options, entity link, QA scores)
+    #### Outputs:
+    - Generated questions with multiple-choice options
+    - Download options for CSV and PDF formats
+    - Visualization of overall scores
+    """)
+
+def get_pdf_text(pdf_file):
+    doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
+    text = ""
+    for page_num in range(doc.page_count):
+        page = doc.load_page(page_num)
+        text += page.get_text()
+    return text
+
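get_pdf_text consumes any binary file-like object, so it plugs directly into Streamlit's uploader. A usage sketch (the widget label is illustrative, not from the commit):

    uploaded = st.file_uploader("Upload PDF", type="pdf")  # hypothetical label
    if uploaded is not None:
        raw_text = get_pdf_text(uploaded)  # pymupdf reads the bytes page by page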
+def save_feedback_og(question, answer, rating, options, context):
+    feedback_file = 'question_feedback.json'
+    if os.path.exists(feedback_file):
+        with open(feedback_file, 'r') as f:
+            feedback_data = json.load(f)
+    else:
+        feedback_data = []
+    tpl = {
+        'question' : question,
+        'answer' : answer,
+        'context' : context,
+        'options' : options,
+        'rating' : rating,
+    }
+    # feedback_data[question] = rating
+    feedback_data.append(tpl)
+    print(feedback_data)
+    with open(feedback_file, 'w') as f:
+        json.dump(feedback_data, f)
+
+    return feedback_file
+
+# -----------------------------------------------------------------------------------------
+def send_email_with_attachment(email_subject, email_body, recipient_emails, sender_email, sender_password, attachment):
+    smtp_server = "smtp.gmail.com"  # Replace with your SMTP server
+    smtp_port = 587  # Replace with your SMTP port
+
+    # Create the email message
+    message = MIMEMultipart()
+    message['From'] = sender_email
+    message['To'] = ", ".join(recipient_emails)
+    message['Subject'] = email_subject
+    message.attach(MIMEText(email_body, 'plain'))
+
+    # Attach the feedback data if available
+    if attachment:
+        attachment_part = MIMEApplication(attachment.getvalue(), Name="feedback_data.json")
+        attachment_part['Content-Disposition'] = f'attachment; filename="feedback_data.json"'
+        message.attach(attachment_part)
+
+    # Send the email
+    try:
+        with smtplib.SMTP(smtp_server, smtp_port) as server:
+            server.starttls()
+            print(sender_email)
+            print(sender_password)
+            server.login(sender_email, sender_password)
+            text = message.as_string()
+            server.sendmail(sender_email, recipient_emails, text)
+            return True
+    except Exception as e:
+        st.error(f"Failed to send email: {str(e)}")
+        return False
+# ----------------------------------------------------------------------------------
+
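Note that the two print calls inside the try block write the sender's address and password to the server log. A safer pattern, assuming the deployment defines these (hypothetical) keys in secrets.toml, is to pull credentials from st.secrets and never echo them:

    sender_email = st.secrets["smtp_email"]        # hypothetical secret key
    sender_password = st.secrets["smtp_password"]  # hypothetical secret key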
+def collect_feedback(i,question, answer, context, options):
+    st.write("Please provide feedback for this question:")
+    edited_question = st.text_input("Enter improved question",value=question,key=f'fdx1{i}')
+    clarity = st.slider("Clarity", 1, 5, 3, help="1 = Very unclear, 5 = Very clear",key=f'fdx2{i}')
+    difficulty = st.slider("Difficulty", 1, 5, 3, help="1 = Very easy, 5 = Very difficult",key=f'fdx3{i}')
+    relevance = st.slider("Relevance", 1, 5, 3, help="1 = Not relevant, 5 = Highly relevant",key=f'fdx4{i}')
+    option_quality = st.slider("Quality of Options", 1, 5, 3, help="1 = Poor options, 5 = Excellent options",key=f'fdx5{i}')
+    overall_rating = st.slider("Overall Rating", 1, 5, 3, help="1 = Poor, 5 = Excellent",key=f'fdx6{i}')
+    comments = st.text_input("Additional Comments", "",key=f'fdx7{i}')
+
+    if st.button("Submit Feedback",key=f'fdx8{i}'):
+        feedback = {
+            "question": question,
+            'edited_question':edited_question,
+            "answer": answer,
+            "options": options,
+            "clarity": clarity,
+            "difficulty": difficulty,
+            "relevance": relevance,
+            "option_quality": option_quality,
+            "overall_rating": overall_rating,
+            "comments": comments
+        }
+        save_feedback(feedback)
+        st.success("Thank you for your feedback!")
+
+def save_feedback(feedback):
+    st.session_state.feedback_data.append(feedback)
+
+def analyze_feedback():
+    if not st.session_state.feedback_data:
+        st.warning("No feedback data available yet.")
+        return
+
+    df = pd.DataFrame(st.session_state.feedback_data)
+
+    st.write("Feedback Analysis")
+    st.write(f"Total feedback collected: {len(df)}")
+
+    metrics = ['clarity', 'difficulty', 'relevance', 'option_quality', 'overall_rating']
+
+    for metric in metrics:
+        fig, ax = plt.subplots()
+        df[metric].value_counts().sort_index().plot(kind='bar', ax=ax)
+        plt.title(f"Distribution of {metric.capitalize()} Ratings")
+        plt.xlabel("Rating")
+        plt.ylabel("Count")
+        st.pyplot(fig)
+
+    st.write("Average Ratings:")
+    st.write(df[metrics].mean())
+
+    # Word cloud of comments
+    comments = " ".join(df['comments'])
+    if len(comments) > 1:
+        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(comments)
+        fig, ax = plt.subplots()
+        plt.imshow(wordcloud, interpolation='bilinear')
+        plt.axis("off")
+        st.pyplot(fig)
+
+
+def export_feedback_data():
+    if not st.session_state.feedback_data:
+        st.warning("No feedback data available.")
+        return None
+
+    # Convert feedback data to JSON
+    json_data = json.dumps(st.session_state.feedback_data, indent=2)
+
+    # Create a BytesIO object
+    buffer = BytesIO()
+    buffer.write(json_data.encode())
+    buffer.seek(0)
+
+    return buffer
+
+# Function to clean text
+def clean_text(text):
+    text = re.sub(r"[^\x00-\x7F]", " ", text)
+    text = re.sub("[\n]", " ", text)
+    return text
+
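Both substitutions replace matches with a single space, so clean_text normalizes rather than deletes. Worked by hand from the two regexes:

    clean_text("café\nau lait")  # -> "caf  au lait": 'é' is outside \x00-\x7F, '\n' becomes a space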
+# Function to create text chunks
+def segment_text(text, max_segment_length=700, batch_size=7):
+    sentences = sent_tokenize(text)
+    segments = []
+    current_segment = ""
+
+    for sentence in sentences:
+        if len(current_segment) + len(sentence) <= max_segment_length:
+            current_segment += sentence + " "
+        else:
+            segments.append(current_segment.strip())
+            current_segment = sentence + " "
+
+    if current_segment:
+        segments.append(current_segment.strip())
+
+    # Create batches
+    batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
+    return batches
+
+
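segment_text greedily packs whole sentences into segments of at most ~700 characters, then groups segments seven to a batch. A small self-contained illustration (inputs invented for the example):

    text = " ".join(f"This is sentence number {i}." for i in range(100))
    batches = segment_text(text)
    # each inner list holds up to 7 segments; each segment stays under 700 characters
    print(len(batches), [len(seg) for seg in batches[0]])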
+# Function to extract keywords using combined techniques
+def extract_keywords(text, extract_all):
+    try:
+        gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
+        labels = ["person", "organization", "email", "Award", "Date", "Competitions", "Teams", "location", "percentage", "money"]
+        entities = gliner_model.predict_entities(text, labels, threshold=0.7)
+
+        gliner_keywords = list(set([ent["text"] for ent in entities]))
+        print(f"Gliner keywords:{gliner_keywords}")
+        # Use only GLiNER entities
+        if extract_all is False:
+            return list(gliner_keywords)
+
+        doc = nlp(text)
+        spacy_keywords = set([ent.text for ent in doc.ents])
+        spacy_entities = spacy_keywords
+        print(f"\n\nSpacy Entities: {spacy_entities} \n\n")
+
+        # if extract_all is False:
+        #     return list(spacy_entities)
+
+        # Use RAKE
+        rake = Rake()
+        rake.extract_keywords_from_text(text)
+        rake_keywords = set(rake.get_ranked_phrases())
+        print(f"\n\nRake Keywords: {rake_keywords} \n\n")
+        # Use spaCy for NER and POS tagging
+        spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
+        print(f"\n\nSpacy Keywords: {spacy_keywords} \n\n")
+        # Use TF-IDF
+        vectorizer = TfidfVectorizer(stop_words='english')
+        X = vectorizer.fit_transform([text])
+        tfidf_keywords = set(vectorizer.get_feature_names_out())
+        print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
+
+        # Combine all keywords
+        combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords).union(gliner_keywords)
+
+        return list(combined_keywords)
+    except Exception as e:
+        raise QuestionGenerationError(f"Error in keyword extraction: {str(e)}")
+
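GLiNER.from_pretrained runs inside extract_keywords, so the multitask model is re-instantiated on every call. A cached loader in the same style as load_model above would avoid that; a sketch, not part of this commit:

    @st.cache_resource
    def load_gliner_model():
        # loaded once per process and reused across reruns
        return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")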
+def get_similar_words_sense2vec(word, n=3):
+    # Try to find the word with its most likely part-of-speech
+    word_with_pos = word + "|NOUN"
+    if word_with_pos in s2v:
+        similar_words = s2v.most_similar(word_with_pos, n=n)
+        return [word.split("|")[0] for word, _ in similar_words]
+
+    # If not found, try without POS
+    if word in s2v:
+        similar_words = s2v.most_similar(word, n=n)
+        return [word.split("|")[0] for word, _ in similar_words]
+
+    return []
+
+def get_synonyms(word, n=3):
+    synonyms = []
+    for syn in wordnet.synsets(word):
+        for lemma in syn.lemmas():
+            if lemma.name() != word and lemma.name() not in synonyms:
+                synonyms.append(lemma.name())
+                if len(synonyms) == n:
+                    return synonyms
+    return synonyms
+
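get_synonyms walks WordNet synsets in order and stops after n distinct lemma names, so for a common noun the closest senses come first. With standard WordNet data this call would likely return:

    get_synonyms("car", 3)  # ['auto', 'automobile', 'machine'] from the first synset of 'car'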
+def generate_options(answer, context, n=3):
+    options = [answer]
+
+    # Add contextually relevant words using a pre-trained model
+    context_embedding = context_model.encode(context)
+    answer_embedding = context_model.encode(answer)
+    context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+
+    # Compute similarity scores and sort context words
+    similarity_scores = [util.pytorch_cos_sim(context_model.encode(word), answer_embedding).item() for word in context_words]
+    sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
+    options.extend(sorted_context_words[:n])
+
+    # Try to get similar words based on sense2vec
+    similar_words = get_similar_words_sense2vec(answer, n)
+    options.extend(similar_words)
+
+    # If we don't have enough options, try synonyms
+    if len(options) < n + 1:
+        synonyms = get_synonyms(answer, n - len(options) + 1)
+        options.extend(synonyms)
+
+    # If we still don't have enough options, extract other entities from the context
+    if len(options) < n + 1:
+        doc = nlp(context)
+        entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
+        options.extend(entities[:n - len(options) + 1])
+
+    # If we still need more options, add some random words from the context
+    if len(options) < n + 1:
+        context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+        options.extend(random.sample(context_words, min(n - len(options) + 1, len(context_words))))
+    print(f"\n\nAll Possible Options: {options}\n\n")
+    # Ensure we have the correct number of unique options
+    options = list(dict.fromkeys(options))[:n+1]
+
+    # Shuffle the options
+    random.shuffle(options)
+
+    return options
+
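The dict.fromkeys step deduplicates while preserving insertion order, which is what keeps the correct answer (inserted first) alive through the cut to n+1 options before the shuffle:

    list(dict.fromkeys(['Paris', 'Lyon', 'Paris', 'Nice']))  # -> ['Paris', 'Lyon', 'Nice']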
+# Function to map keywords to sentences with customizable context window size
+def map_keywords_to_sentences(text, keywords, context_window_size):
+    sentences = sent_tokenize(text)
+    keyword_sentence_mapping = {}
+    print(f"\n\nSentences: {sentences}\n\n")
+    for keyword in keywords:
+        for i, sentence in enumerate(sentences):
+            if keyword in sentence:
+                # Combine current sentence with surrounding sentences for context
+                # start = max(0, i - context_window_size)
+                # end = min(len(sentences), i + context_window_size + 1)
+                start = max(0, i - context_window_size)
+                context_sentences = sentences[start:i+1]
+                context = ' '.join(context_sentences)
+                # context = ' '.join(sentences[start:end])
+                if keyword not in keyword_sentence_mapping:
+                    keyword_sentence_mapping[keyword] = context
+                else:
+                    keyword_sentence_mapping[keyword] += ' ' + context
+    return keyword_sentence_mapping
+
+
+# Function to perform entity linking using Wikipedia API
+@lru_cache(maxsize=128)
+def entity_linking(keyword):
+    page = wiki_wiki.page(keyword)
+    if page.exists():
+        return page.fullurl
+    return None
+
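map_keywords_to_sentences joins only the sentences before the match (sentences[start:i+1]), so the window is one-sided even though the slider promises sentences before and after; the commented-out end computation shows the symmetric variant. entity_linking, wrapped in functools.lru_cache, pays for each distinct keyword once and exposes its hit statistics:

    entity_linking("Python (programming language)")
    entity_linking("Python (programming language)")  # second call served from the cache
    print(entity_linking.cache_info())               # e.g. hits=1, misses=1, maxsize=128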
+async def generate_question_async(context, answer, num_beams):
+    try:
+        input_text = f"<context> {context} <answer> {answer}"
+        print(f"\n{input_text}\n")
+        input_ids = tokenizer.encode(input_text, return_tensors='pt')
+        outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
+        question = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        print(f"\n{question}\n")
+        return question
+    except Exception as e:
+        raise QuestionGenerationError(f"Error in question generation: {str(e)}")
+
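asyncio.to_thread (Python 3.9+) pushes the blocking model.generate call onto a worker thread so the event loop stays free. process_batch below still awaits one keyword at a time; if the model tolerated concurrent generate calls, the same coroutine could be fanned out, sketched here as an assumption rather than a claim about this commit:

    questions = await asyncio.gather(*(
        generate_question_async(context, keyword, num_beams)
        for keyword, context in keyword_sentence_mapping.items()
    ))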
+async def generate_options_async(answer, context, n=3):
+    try:
+        options = [answer]
+
+        # Add contextually relevant words using a pre-trained model
+        context_embedding = await asyncio.to_thread(context_model.encode, context)
+        answer_embedding = await asyncio.to_thread(context_model.encode, answer)
+        context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
+
+        # Compute similarity scores and sort context words
+        similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
+        sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
+        options.extend(sorted_context_words[:n])
+
+        # Try to get similar words based on sense2vec
+        similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
+        options.extend(similar_words)
+
+        # If we don't have enough options, try synonyms
+        if len(options) < n + 1:
+            synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
+            options.extend(synonyms)
+
+        # Ensure we have the correct number of unique options
+        options = list(dict.fromkeys(options))[:n+1]
+
+        # Shuffle the options
+        random.shuffle(options)
+
+        return options
+    except Exception as e:
+        raise QuestionGenerationError(f"Error in generating options: {str(e)}")
+
+
+# Function to generate questions using beam search
+async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
+    try:
+        batches = segment_text(text)
+        keywords = extract_keywords(text, extract_all_keywords)
+        all_questions = []
+
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+
+        for i, batch in enumerate(batches):
+            status_text.text(f"Processing batch {i+1} of {len(batches)}...")
+            batch_questions = await process_batch(batch, keywords, context_window_size, num_beams)
+            all_questions.extend(batch_questions)
+            progress_bar.progress((i + 1) / len(batches))
+
+            if len(all_questions) >= num_questions:
+                break
+
+        progress_bar.empty()
+        status_text.empty()
+
+        return all_questions[:num_questions]
+    except QuestionGenerationError as e:
+        st.error(f"An error occurred during question generation: {str(e)}")
+        return []
+    except Exception as e:
+        st.error(f"An unexpected error occurred: {str(e)}")
+        return []
+
+async def generate_fill_in_the_blank_questions(context,answer):
+    answerSize = len(answer)
+    replacedBlanks = ""
+    for i in range(answerSize):
+        replacedBlanks += "_"
+    blank_q = context.replace(answer,replacedBlanks)
+    return blank_q
+
+async def process_batch(batch, keywords, context_window_size, num_beams):
+    questions = []
+    for text in batch:
+        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
+        for keyword, context in keyword_sentence_mapping.items():
+            question = await generate_question_async(context, keyword, num_beams)
+            options = await generate_options_async(keyword, context)
+            blank_question = await generate_fill_in_the_blank_questions(context,keyword)
+            overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context, question, keyword)
+            if overall_score >= 0.5:
+                questions.append({
+                    "question": question,
+                    "context": context,
+                    "answer": keyword,
+                    "options": options,
+                    "overall_score": overall_score,
+                    "relevance_score": relevance_score,
+                    "complexity_score": complexity_score,
+                    "spelling_correctness": spelling_correctness,
+                    "blank_question": blank_question,
+                })
+    return questions
+
+# Function to export questions to CSV
+def export_to_csv(data):
+    # df = pd.DataFrame(data, columns=["Context", "Answer", "Question", "Options"])
+    df = pd.DataFrame(data)
+    # csv = df.to_csv(index=False,encoding='utf-8')
+    csv = df.to_csv(index=False)
+    return csv
+
+# Function to export questions to PDF
+def export_to_pdf(data):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+
+    for item in data:
+        pdf.multi_cell(0, 10, f"Context: {item['context']}")
+        pdf.multi_cell(0, 10, f"Question: {item['question']}")
+        pdf.multi_cell(0, 10, f"Answer: {item['answer']}")
+        pdf.multi_cell(0, 10, f"Options: {', '.join(item['options'])}")
+        pdf.multi_cell(0, 10, f"Overall Score: {item['overall_score']:.2f}")
+        pdf.ln(10)
+
+    return pdf.output(dest='S').encode('latin-1')
+
+def display_word_cloud(generated_questions):
+    word_frequency = {}
+    for question in generated_questions:
+        words = question.split()
+        for word in words:
+            word_frequency[word] = word_frequency.get(word, 0) + 1
+
+    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequency)
+    plt.figure(figsize=(10, 5))
+    plt.imshow(wordcloud, interpolation='bilinear')
+    plt.axis('off')
+    st.pyplot()
+
+
+def assess_question_quality(context, question, answer):
+    # Assess relevance using cosine similarity
+    context_doc = nlp(context)
+    question_doc = nlp(question)
+    relevance_score = context_doc.similarity(question_doc)
+
+    # Assess complexity using token length (as a simple metric)
+    complexity_score = min(len(question_doc) / 20, 1)  # Normalize to 0-1
+
+    # Assess spelling correctness
+    misspelled = spell.unknown(question.split())
+    spelling_correctness = 1 - (len(misspelled) / len(question.split()))  # Normalize to 0-1
+
+    # Calculate overall score (you can adjust weights as needed)
+    overall_score = (
+        0.4 * relevance_score +
+        0.4 * complexity_score +
+        0.2 * spelling_correctness
+    )
+
+    return overall_score, relevance_score, complexity_score, spelling_correctness
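The overall score is a fixed 0.4/0.4/0.2 blend, and process_batch keeps a question only when it reaches 0.5. Worked example: relevance 0.8, a 10-token question (complexity 10/20 = 0.5), and no misspellings give

    0.4 * 0.8 + 0.4 * 0.5 + 0.2 * 1.0 = 0.32 + 0.20 + 0.20 = 0.72  # clears the 0.5 cutoff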
 
 def main():
+    # Streamlit interface
     st.title(":blue[Question Generator System]")
     session_id = get_session_id()
     state = initialize_state(session_id)
@@ -44,18 +633,18 @@ def main():
         st.session_state.feedback_data = []
 
     with st.sidebar:
-        show_info = st.toggle('Show Info',False)
+        show_info = st.toggle('Show Info',True)
         if show_info:
            display_info()
        st.subheader("Customization Options")
        # Customization options
        input_type = st.radio("Select Input Preference", ("Text Input","Upload PDF"))
        with st.expander("Choose the Additional Elements to show"):
-            show_context = st.checkbox("Context",False)
+            show_context = st.checkbox("Context",True)
            show_answer = st.checkbox("Answer",True)
-            show_options = st.checkbox("Options",True)
+            show_options = st.checkbox("Options",False)
            show_entity_link = st.checkbox("Entity Link For Wikipedia",True)
-            show_qa_scores = st.checkbox("QA Score",True)
+            show_qa_scores = st.checkbox("QA Score",False)
            show_blank_question = st.checkbox("Fill in the Blank Questions",True)
        num_beams = st.slider("Select number of beams for question generation", min_value=2, max_value=10, value=2)
        context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
 
@@ -81,15 +670,15 @@ def main():
     text = clean_text(text)
     with st.expander("Show text"):
         st.write(text)
-    # st.text(text)
     generate_questions_button = st.button("Generate Questions",help="This is the generate questions button")
     # st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
 
+    # if generate_questions_button:
     if generate_questions_button and text:
         start_time = time.time()
         with st.spinner("Generating questions..."):
             try:
-                state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords,modelname))
+                state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
                 if not state['generated_questions']:
                     st.warning("No questions were generated. The text might be too short or lack suitable content.")
                 else:
 
@@ -150,16 +739,12 @@ def main():
     # Export buttons
     # if st.session_state.generated_questions:
     if state['generated_questions']:
-        with st.sidebar:
-            # Adding error handling while exporting the files
-            # ---------------------------------------------------------------------
-            try:
-                csv_data = export_to_csv(state['generated_questions'])
-                st.download_button(label="Download CSV", data=csv_data, file_name='questions.csv', mime='text/csv')
-                pdf_data = export_to_pdf(state['generated_questions'])
-                st.download_button(label="Download PDF", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
-            except Exception as e:
-                st.error(f"Error exporting CSV: {e}")
+        with st.sidebar:
+            csv_data = export_to_csv(state['generated_questions'])
+            st.download_button(label="Download CSV", data=csv_data, file_name='questions.csv', mime='text/csv')
+
+            pdf_data = export_to_pdf(state['generated_questions'])
+            st.download_button(label="Download PDF", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
 
         with st.expander("View Visualizations"):
             questions = [tpl['question'] for tpl in state['generated_questions']]
 
@@ -170,6 +755,7 @@ def main():
             overall_scores = pd.DataFrame(overall_scores,columns=['Overall Scores'])
             st.line_chart(overall_scores)
 
+
     # View Feedback Statistics
     with st.expander("View Feedback Statistics"):
         analyze_feedback()