Spaces:
Sleeping
Sleeping
Commit
•
f7842f6
1
Parent(s):
03f344d
Upload files for modules/functions (#5)
Browse files- Upload files for modules/functions (7fafac42e4b5d1e40338e2b5ace7ef9fba805bff)
Co-authored-by: Thakkar Aneri Pareshkumar <AneriThakkar@users.noreply.huggingface.co>
- data_export.py +61 -0
- feedback.py +108 -0
- fill_in_the_blanks_generation.py +8 -0
- keyword_extraction.py +133 -0
- load_models.py +45 -0
- mapping_keywords.py +22 -0
- option_generation.py +135 -0
- question_generation.py +122 -0
- text_processing.py +41 -0
- utils.py +75 -0
- visualization.py +16 -0
data_export.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from fpdf import FPDF
|
3 |
+
import streamlit as st
|
4 |
+
import smtplib
|
5 |
+
from email.mime.multipart import MIMEMultipart
|
6 |
+
from email.mime.text import MIMEText
|
7 |
+
# from email.mime.base import MIMEBase
|
8 |
+
from email.mime.application import MIMEApplication
|
9 |
+
# from email import encoders
|
10 |
+
|
11 |
+
|
12 |
+
def export_to_csv(data):
    """Serialize *data* (a list of records) to a CSV string without the index."""
    frame = pd.DataFrame(data)
    return frame.to_csv(index=False)
|
16 |
+
|
17 |
+
def export_to_pdf(data):
    """Render the question records as a simple PDF and return its raw bytes."""
    doc = FPDF()
    doc.add_page()
    doc.set_font("Arial", size=12)

    for entry in data:
        # One multi-line cell per field, then a blank gap between records.
        for line in (
            f"Context: {entry['context']}",
            f"Question: {entry['question']}",
            f"Answer: {entry['answer']}",
            f"Options: {', '.join(entry['options'])}",
            f"Overall Score: {entry['overall_score']:.2f}",
        ):
            doc.multi_cell(0, 10, line)
        doc.ln(10)

    # dest='S' returns the document as a string; encode for download/email.
    return doc.output(dest='S').encode('latin-1')
|
31 |
+
|
32 |
+
def send_email_with_attachment(email_subject, email_body, recipient_emails, sender_email, sender_password, attachment):
    """Send a plain-text email, optionally with a JSON attachment, over SMTP.

    Connects to Gmail's SMTP server with STARTTLS, authenticates with the
    supplied credentials, and sends to every address in *recipient_emails*.
    Returns True on success; on failure shows the error via st.error and
    returns False.

    NOTE(review): *attachment* is expected to be a BytesIO-like object
    (it must support .getvalue()) — confirm against callers.
    """
    smtp_server = "smtp.gmail.com"  # Replace with your SMTP server
    smtp_port = 587  # Replace with your SMTP port (587 = STARTTLS)

    # Create the email message
    message = MIMEMultipart()
    message['From'] = sender_email
    message['To'] = ", ".join(recipient_emails)
    message['Subject'] = email_subject
    message.attach(MIMEText(email_body, 'plain'))

    # Attach the feedback data if available
    if attachment:
        attachment_part = MIMEApplication(attachment.getvalue(), Name="feedback_data.json")
        attachment_part['Content-Disposition'] = f'attachment; filename="feedback_data.json"'
        message.attach(attachment_part)

    # Send the email
    try:
        with smtplib.SMTP(smtp_server, smtp_port) as server:
            # Upgrade the connection to TLS before sending credentials.
            server.starttls()
            server.login(sender_email, sender_password)
            text = message.as_string()
            server.sendmail(sender_email, recipient_emails, text)
            return True
    except Exception as e:
        # Surface the failure in the Streamlit UI rather than crashing.
        st.error(f"Failed to send email: {str(e)}")
        return False
|
feedback.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import json
|
3 |
+
from io import BytesIO
|
4 |
+
import pandas as pd
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
from wordcloud import WordCloud
|
7 |
+
import os
|
8 |
+
|
9 |
+
def save_feedback_og(feedback):
    """Append one feedback record to feedback_data.json and to session state.

    The file holds a JSON list of records; it is created on first use.
    Returns the file path so callers can offer it for download/emailing.
    """
    feedback_file = 'feedback_data.json'
    # Load existing records, or start a fresh list if the file is absent.
    if os.path.exists(feedback_file):
        with open(feedback_file, 'r') as f:
            feedback_data = json.load(f)
    else:
        feedback_data = []
    feedback_data.append(feedback)

    with open(feedback_file, 'w') as f:
        json.dump(feedback_data, f)
    # Mirror the record into the in-memory list used by analyze_feedback().
    # NOTE(review): assumes st.session_state.feedback_data was initialised
    # elsewhere — confirm against the app entry point.
    st.session_state.feedback_data.append(feedback)
    return feedback_file
|
32 |
+
|
33 |
+
def collect_feedback(i,question, answer, context, options):
    """Render a Streamlit feedback form for one generated question.

    *i* is the question's index on the page and is used only to build
    unique widget keys so multiple forms can coexist. On submit, the full
    record (including any edited question text) is saved via
    save_feedback_og().
    """
    st.write("Please provide feedback for this question:")
    edited_question = st.text_input("Enter improved question",value=question,key=f'fdx1{i}')
    # All rating sliders run 1-5 and default to the neutral midpoint 3.
    clarity = st.slider("Clarity", 1, 5, 3, help="1 = Very unclear, 5 = Very clear",key=f'fdx2{i}')
    difficulty = st.slider("Difficulty", 1, 5, 3, help="1 = Very easy, 5 = Very difficult",key=f'fdx3{i}')
    relevance = st.slider("Relevance", 1, 5, 3, help="1 = Not relevant, 5 = Highly relevant",key=f'fdx4{i}')
    option_quality = st.slider("Quality of Options", 1, 5, 3, help="1 = Poor options, 5 = Excellent options",key=f'fdx5{i}')
    overall_rating = st.slider("Overall Rating", 1, 5, 3, help="1 = Poor, 5 = Excellent",key=f'fdx6{i}')
    comments = st.text_input("Additional Comments", "",key=f'fdx7{i}')

    if st.button("Submit Feedback",key=f'fdx8{i}'):
        # Persist the complete record, including the original question so
        # edits can be compared against it later.
        feedback = {
            "context": context,
            "question": question,
            'edited_question':edited_question,
            "answer": answer,
            "options": options,
            "clarity": clarity,
            "difficulty": difficulty,
            "relevance": relevance,
            "option_quality": option_quality,
            "overall_rating": overall_rating,
            "comments": comments
        }
        save_feedback_og(feedback)

        st.success("Thank you for your feedback!")
|
61 |
+
|
62 |
+
def analyze_feedback():
    """Render charts summarising all feedback collected this session.

    Shows a bar chart of the 1-5 rating distribution for each metric, the
    mean of each metric, and a word cloud of free-text comments (only when
    comments exist). Warns and returns early if no feedback is stored.
    """
    if not st.session_state.feedback_data:
        st.warning("No feedback data available yet.")
        return

    df = pd.DataFrame(st.session_state.feedback_data)

    st.write("Feedback Analysis")
    st.write(f"Total feedback collected: {len(df)}")

    metrics = ['clarity', 'difficulty', 'relevance', 'option_quality', 'overall_rating']

    # One bar chart per metric showing how many responses gave each rating.
    for metric in metrics:
        fig, ax = plt.subplots()
        df[metric].value_counts().sort_index().plot(kind='bar', ax=ax)
        plt.title(f"Distribution of {metric.capitalize()} Ratings")
        plt.xlabel("Rating")
        plt.ylabel("Count")
        st.pyplot(fig)

    st.write("Average Ratings:")
    st.write(df[metrics].mean())

    # Word cloud of comments
    comments = " ".join(df['comments'])
    if len(comments) > 1:  # skip when there is effectively no comment text
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(comments)
        fig, ax = plt.subplots()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        st.pyplot(fig)
|
93 |
+
|
94 |
+
|
95 |
+
def export_feedback_data():
    """Serialise the session's feedback to JSON in an in-memory buffer.

    Returns a BytesIO positioned at the start, ready for download or
    attachment, or None (after a warning) when no feedback exists.
    """
    records = st.session_state.feedback_data
    if not records:
        st.warning("No feedback data available.")
        return None

    # Pretty-printed JSON, encoded and wrapped in a fresh buffer.
    payload = json.dumps(records, indent=2).encode()
    return BytesIO(payload)
|
fill_in_the_blanks_generation.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
async def generate_fill_in_the_blank_questions(context,answer):
    """Return *context* with every occurrence of *answer* blanked out.

    The blank is one underscore per character of the answer, so the blank's
    width hints at the answer length. If the answer does not occur in the
    context verbatim, the context is returned unchanged.
    """
    # Build the blank in one step instead of a char-by-char loop.
    blanks = "_" * len(answer)
    return context.replace(answer, blanks)
|
keyword_extraction.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk.corpus import stopwords
|
2 |
+
from rake_nltk import Rake
|
3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
4 |
+
import spacy
|
5 |
+
from transformers import pipeline
|
6 |
+
from gliner import GLiNER
|
7 |
+
from load_models import load_nlp_models
|
8 |
+
|
9 |
+
nlp, s2v = load_nlp_models()
|
10 |
+
|
11 |
+
def filter_keywords(extracted_keywords):
    """Remove punctuation, boilerplate and other noise tokens from keywords.

    Matching is exact (case-sensitive) against a fixed denylist. Returns a
    sorted, de-duplicated list of the surviving keywords.
    """
    # Fixed denylist as a set literal (duplicates from the original list
    # removed; set membership is O(1) per keyword).
    unwanted_keywords = {
        # Common punctuation marks
        '.', ',', '!', '?', ':', ';', '-', '_', '(', ')', '[', ']', '{', '}',
        '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '+', '=', '<', '>',
        '`', '~', '"', "'",
        # Common contractions (if not already removed as stopwords)
        "n't", "'s", "'m", "'re", "'ll", "'ve", "'d",
        # Common abbreviations
        'etc', 'eg', 'ie', 'ex', 'vs', 'viz', 'e.g', 'i.e',
        'vol', 'sec', 'pg', 'id', 'ref', 'eq',
        'tbd', 'tba',  # To be determined/announced
        'na', 'n/a',  # Not applicable
        # Single characters
        *'abcdefghijklmnopqrstuvwxyz',
        # HTML-related tags (if the text contains any HTML content)
        '<html>', '</html>', '<body>', '</body>', '<head>', '</head>',
        '<div>', '</div>', '<p>', '</p>', '<br>', '<hr>',
        '<h1>', '</h1>', '<h2>', '</h2>', '<h3>', '</h3>',
        # Miscellaneous web/file tokens
        'www', 'com', 'http', 'https', 'ftp', 'pdf', 'doc', 'img', 'gif',
        'jpeg', 'jpg', 'png', 'mp4', 'mp3', 'org', 'net', 'edu',
        'untitled', 'noname', 'unknown', 'undefined',
        # Roman numerals used in bullet points or references
        'ii', 'iii', 'iv', 'vi', 'vii', 'viii', 'ix', 'xi', 'xii',
        # Common file extensions (if filenames are included in the text)
        '.jpg', '.png', '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls',
        '.xlsx', '.csv', '.txt', '.zip', '.tar', '.gz', '.exe', '.bat',
        '.sh', '.py', '.cpp', '.java',
        # Tokens related to document structure / general noise
        'chapter', 'section', 'figure', 'table', 'appendix',
        'note', 'item', 'items', 'number', 'numbers', 'case', 'cases',
        'example', 'examples', 'type', 'types', 'part', 'parts',
    }
    # Set difference de-duplicates; sort for a deterministic result.
    return sorted(set(extracted_keywords) - unwanted_keywords)
|
61 |
+
|
62 |
+
|
63 |
+
def remove_stopwords(keywords):
    """Drop English stopwords from *keywords*, preserving original forms.

    Each keyword is normalised by removing internal whitespace and
    lower-casing before the stopword test; survivors are returned in input
    order as the first original keyword matching each normalised form
    (matching the original nested-loop behaviour, in one pass).
    """
    stop_words = set(stopwords.words('english'))
    # Map normalised form -> first original keyword producing it.
    first_original = {}
    for keyword in keywords:
        key = ''.join(keyword.split()).lower()
        first_original.setdefault(key, keyword)
    original_keywords = []
    for keyword in keywords:
        key = ''.join(keyword.split()).lower()
        if key not in stop_words:
            original_keywords.append(first_original[key])
    return original_keywords
|
74 |
+
|
75 |
+
def enhanced_ner(text):
    """Extract entity strings from *text* with a spaCy-trf + HF NER ensemble.

    Returns the union of entity surface forms found by both models.
    """
    # Loading these models is expensive; cache them on the function object
    # so repeated calls reuse the same instances. Also avoids shadowing the
    # module-level `nlp` pipeline, which the original local name did.
    if not hasattr(enhanced_ner, "_models"):
        enhanced_ner._models = (
            spacy.load("en_core_web_trf"),
            pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english"),
        )
    trf_nlp, ner_pipeline = enhanced_ner._models

    doc = trf_nlp(text)
    spacy_entities = set((ent.text, ent.label_) for ent in doc.ents)
    hf_entities = set((ent['word'], ent['entity']) for ent in ner_pipeline(text))
    combined_entities = spacy_entities.union(hf_entities)
    return [entity[0] for entity in combined_entities]
|
84 |
+
|
85 |
+
def extract_keywords(text, extract_all):
    """Extract keywords from *text* using several NER/keyword back-ends.

    When *extract_all* is False, only entity-based keywords (spaCy, GLiNER
    and the transformer NER ensemble) are merged and returned; otherwise
    RAKE keyphrases and TF-IDF vocabulary terms are included as well. All
    candidates pass through remove_stopwords() and filter_keywords().
    Raises Exception (wrapping the original error) on any failure.
    """
    try:
        text = text.lower()
        # Transformer-based NER ensemble (spaCy trf + HF pipeline).
        enhanced_ner_entities = enhanced_ner(text)
        print("Enhanced ner entities: ",enhanced_ner_entities)
        enhanced_ner_entities = remove_stopwords(enhanced_ner_entities)
        enhanced_ner_entities = filter_keywords(enhanced_ner_entities)
        print("Enhanced ner entities after applying filter and stopwords removal: ",enhanced_ner_entities)

        # GLiNER zero-shot NER over a fixed, mostly PII-oriented label set.
        # NOTE(review): the GLiNER model is loaded on every call — consider
        # caching it like the other models.
        gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
        labels = ["person", "organization", "phone number", "address", "email", "date of birth",
                  "mobile phone number", "medication", "ip address", "email address",
                  "landline phone number", "blood type", "digital signature", "postal code",
                  "date"]
        entities = gliner_model.predict_entities(text, labels, threshold=0.5)

        gliner_keywords = set(remove_stopwords([ent["text"] for ent in entities]))
        print(f"Gliner keywords:{gliner_keywords}")

        # spaCy statistical NER using the module-level pipeline.
        doc = nlp(text)
        spacy_keywords = set(remove_stopwords([ent.text for ent in doc.ents]))
        print(f"\n\nSpacy Entities: {spacy_keywords} \n\n")

        # Entity-only mode: skip RAKE/TF-IDF and return merged entities.
        if extract_all is False:
            combined_keywords_without_all = list(spacy_keywords.union(gliner_keywords).union(enhanced_ner_entities))
            filtered_results = filter_keywords(combined_keywords_without_all)
            print("Keywords returned: ",filtered_results)
            return list(filtered_results)

        # RAKE keyphrase extraction.
        rake = Rake()
        rake.extract_keywords_from_text(text)
        rake_keywords = set(remove_stopwords(rake.get_ranked_phrases()))
        print(f"\n\nRake Keywords: {rake_keywords} \n\n")

        # TF-IDF vocabulary terms (fit on this single document).
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform([text])
        tfidf_keywords = set(remove_stopwords(vectorizer.get_feature_names_out()))
        print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")

        combined_keywords = list(rake_keywords.union(spacy_keywords).union(tfidf_keywords).union(gliner_keywords))
        filtered_results = filter_keywords(combined_keywords)
        print("Keywords returned: ",filtered_results)
        return list(filtered_results)

    except Exception as e:
        raise Exception(f"Error in keyword extraction: {str(e)}")
|
load_models.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import T5ForConditionalGeneration, T5Tokenizer
|
3 |
+
import spacy
|
4 |
+
import sense2vec
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
from spellchecker import SpellChecker
|
7 |
+
import wikipediaapi
|
8 |
+
from langchain_community.llms import Ollama
|
9 |
+
# import time
|
10 |
+
|
11 |
+
def load_llama():
    """Create and return an Ollama client bound to the local llama3 model."""
    return Ollama(model='llama3:latest')
|
14 |
+
|
15 |
+
@st.cache_resource
def load_model(modelname):
    """Load and cache a T5 model/tokenizer pair for *modelname*."""
    seq2seq = T5ForConditionalGeneration.from_pretrained(modelname)
    tok = T5Tokenizer.from_pretrained(modelname)
    return seq2seq, tok
|
21 |
+
|
22 |
+
# Load Spacy Model
|
23 |
+
@st.cache_resource
def load_nlp_models():
    """Load and cache the spaCy pipeline and sense2vec vectors."""
    spacy_pipeline = spacy.load("en_core_web_md")
    sense_vectors = sense2vec.Sense2Vec().from_disk('s2v_old')
    return spacy_pipeline, sense_vectors
|
28 |
+
|
29 |
+
# Load Quality Assurance Models
|
30 |
+
@st.cache_resource
def load_qa_models():
    """Load and cache the models used for question quality assessment."""
    # Sentence embeddings for similarity scoring, plus a spell checker.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    checker = SpellChecker()
    return embedder, checker
|
37 |
+
|
38 |
+
def initialize_wikiapi():
    """Create an English Wikipedia API client; returns (user_agent, client)."""
    agent = 'QGen/1.2'
    client = wikipediaapi.Wikipedia(user_agent=agent, language='en')
    return agent, client
|
43 |
+
|
44 |
+
|
45 |
+
|
mapping_keywords.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk.tokenize import sent_tokenize
|
2 |
+
|
3 |
+
# Function to map keywords to sentences with customizable context window size
|
4 |
+
def map_keywords_to_sentences(text, keywords, context_window_size):
    """Map each keyword to a context string built from nearby sentences.

    For a hit at sentence *i*, the context is the sentences
    [max(0, i - context_window_size) : i + 1] joined with spaces; multiple
    hits for one keyword are concatenated with a space.

    NOTE(review): matching is plain substring containment, so a keyword
    like "art" also matches inside "part" — confirm this is intended.
    """
    sentences = sent_tokenize(text)
    keyword_sentence_mapping = {}
    for keyword in keywords:
        for i, sentence in enumerate(sentences):
            if keyword in sentence:
                # Window of up to context_window_size preceding sentences
                # plus the matching sentence itself.
                start = max(0, i - context_window_size)
                context = ' '.join(sentences[start:i + 1])
                if keyword in keyword_sentence_mapping:
                    keyword_sentence_mapping[keyword] += ' ' + context
                else:
                    keyword_sentence_mapping[keyword] = context
    return keyword_sentence_mapping
|
option_generation.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
import random
|
3 |
+
import asyncio
|
4 |
+
nltk.download('wordnet')
|
5 |
+
from nltk.corpus import wordnet
|
6 |
+
from sentence_transformers import util
|
7 |
+
from load_models import load_nlp_models, load_llama, load_qa_models
|
8 |
+
from utils import QuestionGenerationError
|
9 |
+
|
10 |
+
nlp, s2v = load_nlp_models()
|
11 |
+
llm = load_llama()
|
12 |
+
similarity_model, spell = load_qa_models()
|
13 |
+
context_model = similarity_model
|
14 |
+
|
15 |
+
def get_similar_words_sense2vec(word, n=3):
    """Return up to *n* words similar to *word* from the sense2vec vectors.

    Tries the word tagged as a noun first (the most common sense key), then
    the raw word; returns [] when neither key exists in the model.
    """
    def _lookup(key):
        # s2v entries look like "word|POS"; strip the sense tag.
        similar = s2v.most_similar(key, n=n)
        return [entry.split("|")[0] for entry, _ in similar]

    for candidate in (word + "|NOUN", word):
        if candidate in s2v:
            return _lookup(candidate)
    return []
|
28 |
+
|
29 |
+
def get_synonyms(word, n=3):
    """Collect up to *n* distinct WordNet synonyms of *word* (excluding itself)."""
    collected = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            name = lemma.name()
            # Skip the word itself and anything already collected.
            if name == word or name in collected:
                continue
            collected.append(name)
            if len(collected) == n:
                return collected
    return collected
|
38 |
+
|
39 |
+
def gen_options(answer,context,question):
    """Ask the LLM for distractors and return them shuffled with *answer*.

    Prompts the llama3 model for four incorrect-but-plausible options as a
    semicolon-separated list, mixes in the correct answer, and shuffles.
    NOTE(review): output size depends entirely on the model honouring the
    semicolon format — malformed replies yield odd option strings.
    """
    prompt=f'''Given the following context, question, and correct answer,
    generate {4} incorrect but plausible answer options. The options should be:
    1. Contextually related to the given context
    2. Grammatically consistent with the question
    3. Different from the correct answer
    4. Not explicitly mentioned in the given context

    Context: {context}
    Question: {question}
    Correct Answer: {answer}

    Provide the options in a semi colon-separated list. Output must contain only the options and nothing else.
    '''
    options= [answer]
    # Stop at llama3's end-of-turn token so we get a single clean reply.
    response = llm.invoke(prompt, stop=['<|eot_id|>'])
    incorrect_options = [option.strip() for option in response.split(';')]
    options.extend(incorrect_options)
    random.shuffle(options)
    print(options)
    return options
|
60 |
+
# print(response)
|
61 |
+
|
62 |
+
def generate_options(answer, context, n=3):
    """Build a shuffled list of up to *n*+1 options: the answer + distractors.

    Distractor sources, tried in order until enough options exist:
    context words ranked by embedding similarity to the answer, sense2vec
    neighbours, WordNet synonyms, named entities from the context, and
    finally random context words. Duplicates are removed preserving order.
    """
    options = [answer]

    # Rank context words by embedding similarity to the answer.
    # (The original also encoded the whole context but never used it.)
    answer_embedding = context_model.encode(answer)
    context_words = [token.text for token in nlp(context)
                     if token.is_alpha and token.text.lower() != answer.lower()]
    similarity_scores = [
        util.pytorch_cos_sim(context_model.encode(word), answer_embedding).item()
        for word in context_words
    ]
    sorted_context_words = [
        word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)
    ]
    options.extend(sorted_context_words[:n])

    # Try to get similar words based on sense2vec.
    options.extend(get_similar_words_sense2vec(answer, n))

    # If we don't have enough options, try synonyms.
    if len(options) < n + 1:
        options.extend(get_synonyms(answer, n - len(options) + 1))

    # Still short: extract other entities from the context.
    if len(options) < n + 1:
        doc = nlp(context)
        entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
        options.extend(entities[:n - len(options) + 1])

    # Last resort: random context words (reuse the list computed above
    # instead of re-running the pipeline as the original did).
    if len(options) < n + 1:
        options.extend(random.sample(context_words,
                                     min(n - len(options) + 1, len(context_words))))
    print(f"\n\nAll Possible Options: {options}\n\n")

    # De-duplicate preserving order, cap at n+1, then shuffle.
    options = list(dict.fromkeys(options))[:n + 1]
    random.shuffle(options)
    return options
|
102 |
+
|
103 |
+
async def generate_options_async(answer, context, n=3):
    """Async variant of generate_options(): answer + up to *n* distractors.

    Model/embedding calls run in worker threads via asyncio.to_thread so
    the event loop stays responsive. Returns at most n+1 unique shuffled
    options (fewer when not enough distractors are found).
    Raises QuestionGenerationError on any failure.
    """
    try:
        options = [answer]

        # Rank context words by embedding similarity to the answer.
        # (The original also encoded the whole context in a thread but
        # never used the result — dropped.)
        answer_embedding = await asyncio.to_thread(context_model.encode, answer)
        context_words = [token.text for token in nlp(context)
                         if token.is_alpha and token.text.lower() != answer.lower()]
        similarity_scores = [
            util.pytorch_cos_sim(
                await asyncio.to_thread(context_model.encode, word),
                answer_embedding,
            ).item()
            for word in context_words
        ]
        sorted_context_words = [
            word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)
        ]
        options.extend(sorted_context_words[:n])

        # Try to get similar words based on sense2vec.
        similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
        options.extend(similar_words)

        # If we don't have enough options, try synonyms.
        if len(options) < n + 1:
            synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
            options.extend(synonyms)

        # De-duplicate preserving order, cap at n+1, then shuffle.
        options = list(dict.fromkeys(options))[:n+1]
        random.shuffle(options)
        return options
    except Exception as e:
        raise QuestionGenerationError(f"Error in generating options: {str(e)}")
|
135 |
+
|
question_generation.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import streamlit as st
|
3 |
+
from text_processing import segment_text
|
4 |
+
from keyword_extraction import extract_keywords
|
5 |
+
from utils import QuestionGenerationError
|
6 |
+
from mapping_keywords import map_keywords_to_sentences
|
7 |
+
from option_generation import gen_options, generate_options_async
|
8 |
+
from fill_in_the_blanks_generation import generate_fill_in_the_blank_questions
|
9 |
+
from load_models import load_nlp_models, load_qa_models, load_model
|
10 |
+
|
11 |
+
nlp, s2v = load_nlp_models()
|
12 |
+
similarity_model, spell = load_qa_models()
|
13 |
+
|
14 |
+
|
15 |
+
def assess_question_quality(context, question, answer):
    """Score a generated question on relevance, complexity and spelling.

    Returns (overall_score, relevance_score, complexity_score,
    spelling_correctness). *answer* is currently unused but kept for
    interface stability.
    """
    # Relevance: spaCy vector similarity between context and question.
    context_doc = nlp(context)
    question_doc = nlp(question)
    relevance_score = context_doc.similarity(question_doc)

    # Complexity: token count normalised to [0, 1] (20+ tokens -> 1).
    complexity_score = min(len(question_doc) / 20, 1)

    # Spelling: fraction of recognised words. Guard against an empty
    # question, which previously raised ZeroDivisionError.
    words = question.split()
    if words:
        misspelled = spell.unknown(words)
        spelling_correctness = 1 - (len(misspelled) / len(words))
    else:
        spelling_correctness = 0.0

    # Weighted aggregate (adjust weights as needed).
    overall_score = (
        0.4 * relevance_score +
        0.4 * complexity_score +
        0.2 * spelling_correctness
    )

    return overall_score, relevance_score, complexity_score, spelling_correctness
|
36 |
+
|
37 |
+
|
38 |
+
async def process_batch(batch, keywords, context_window_size, num_beams, num_questions, modelname):
    """Generate up to *num_questions* question records from one text batch.

    For each text, keywords are mapped to surrounding-sentence contexts,
    then a question, MCQ options and a fill-in-the-blank variant are
    generated per (keyword, context) pair. Only questions scoring >= 0.5
    overall are kept. Returns the list of accepted question dicts.
    """
    questions = []
    print("inside process batch function")
    flag = False  # set once the requested number of questions is reached
    for text in batch:
        if flag:
            break
        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
        print(keyword_sentence_mapping)
        for keyword, context in keyword_sentence_mapping.items():
            print("Length of questions list from process batch function: ",len(questions))
            # Stop both loops as soon as enough questions are collected.
            if len(questions)>=num_questions:
                flag = True
                break
            question = await generate_question_async(context, keyword, num_beams,modelname)
            options = await generate_options_async(keyword, context)
            blank_question = await generate_fill_in_the_blank_questions(context,keyword)
            overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context, question, keyword)
            # Quality gate: drop low-scoring questions entirely.
            if overall_score >= 0.5:
                questions.append({
                    "question": question,
                    "context": context,
                    "answer": keyword,
                    "options": options,
                    "overall_score": overall_score,
                    "relevance_score": relevance_score,
                    "complexity_score": complexity_score,
                    "spelling_correctness": spelling_correctness,
                    "blank_question": blank_question,
                })
    return questions
|
70 |
+
|
71 |
+
|
72 |
+
async def generate_question_async(context, answer, num_beams,modelname):
    """Generate one question for (*context*, *answer*) with a T5 model.

    The model/tokenizer pair comes from the cached load_model(); beam-search
    generation runs in a worker thread so the event loop is not blocked.
    Raises QuestionGenerationError on any failure.
    """
    model, tokenizer = load_model(modelname)
    try:
        # Input format expected by the fine-tuned question-generation model.
        input_text = f"<context> {context} <answer> {answer}"
        print(f"\n{input_text}\n")
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
        question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"\n{question}\n")
        return question
    except Exception as e:
        raise QuestionGenerationError(f"Error in question generation: {str(e)}")
|
85 |
+
|
86 |
+
# Function to generate questions using beam search
|
87 |
+
# Function to generate questions using beam search
async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords,modelname):
    """Top-level pipeline: produce up to *num_questions* questions for *text*.

    Splits the text into sentence batches, extracts keywords once for the
    whole text, then processes batch after batch (updating a Streamlit
    progress bar) until enough questions are accepted. On error, reports
    via st.error and returns an empty list.
    """
    try:
        batches = segment_text(text.lower())
        keywords = extract_keywords(text, extract_all_keywords)
        all_questions = []

        progress_bar = st.progress(0)
        status_text = st.empty()
        print("Final keywords:",keywords)
        print("Number of questions that needs to be generated: ",num_questions)
        print("totoal no of batches:", batches)
        for i, batch in enumerate(batches):
            print("batch no: ", len(batches))
            status_text.text(f"Processing batch {i+1} of {len(batches)}...")
            batch_questions = await process_batch(batch, keywords, context_window_size, num_beams,num_questions,modelname)
            all_questions.extend(batch_questions)
            progress_bar.progress((i + 1) / len(batches))

            print("Length of the all questions list: ",len(all_questions))

            # Stop early once enough questions have been accepted.
            if len(all_questions) >= num_questions:
                break

        progress_bar.empty()
        status_text.empty()

        return all_questions[:num_questions]
    except QuestionGenerationError as e:
        st.error(f"An error occurred during question generation: {str(e)}")
        return []
    except Exception as e:
        st.error(f"An unexpected error occurred: {str(e)}")
        return []
|
120 |
+
|
121 |
+
|
122 |
+
|
text_processing.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import pymupdf
|
3 |
+
from nltk.tokenize import sent_tokenize
|
4 |
+
|
5 |
+
def get_pdf_text(pdf_file):
    """Extract plain text from an uploaded PDF file-like object."""
    doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
    # Concatenate the text of every page in order.
    pages = (doc.load_page(index).get_text() for index in range(doc.page_count))
    return "".join(pages)
|
12 |
+
|
13 |
+
def clean_text(text):
    """Normalise extracted text to clean single-spaced ASCII.

    Typographic quotes and dashes are converted to their ASCII equivalents
    and soft hyphens dropped; any remaining non-ASCII characters become
    spaces, newlines are flattened, and runs of whitespace are collapsed.
    """
    # BUG FIX: normalise typographic characters *before* stripping
    # non-ASCII. The original stripped non-ASCII first, which turned curly
    # quotes/dashes into spaces and made the later substitutions dead code.
    text = re.sub(r'[\u201c\u201d]', '"', text)   # curly double quotes
    text = re.sub(r'[\u2018\u2019]', "'", text)   # curly single quotes
    text = re.sub(r'[\u2012\u2013\u2014\u2015]', '-', text)  # dashes
    text = text.replace('\xad', '')               # soft hyphen: drop entirely
    text = re.sub(r"[^\x00-\x7F]", " ", text)     # remaining non-ASCII -> space
    text = re.sub(r"[\n]", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
|
22 |
+
|
23 |
+
# Function to create text chunks
|
24 |
+
def segment_text(text, max_segment_length=700, batch_size=7):
    """Split text into sentence-aligned segments and group them into batches.

    Sentences are packed greedily into segments of at most
    max_segment_length characters (a single sentence longer than the limit
    becomes its own segment), then segments are chunked into lists of
    batch_size.

    Args:
        text: the input text to segment.
        max_segment_length: soft character cap per segment.
        batch_size: number of segments per batch.

    Returns:
        A list of batches, each a list of segment strings. Empty input
        yields an empty list.
    """
    sentences = sent_tokenize(text)
    segments = []
    current_segment = ""

    for sentence in sentences:
        if len(current_segment) + len(sentence) <= max_segment_length:
            current_segment += sentence + " "
        else:
            # Only flush a non-empty buffer: previously an empty-string
            # segment was appended whenever a sentence exceeded the limit
            # while the buffer was empty (e.g. an oversized first sentence).
            if current_segment:
                segments.append(current_segment.strip())
            current_segment = sentence + " "

    if current_segment:
        segments.append(current_segment.strip())

    # Group consecutive segments into fixed-size batches.
    batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
    return batches
|
utils.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import uuid
|
3 |
+
from load_models import initialize_wikiapi
|
4 |
+
from functools import lru_cache
|
5 |
+
|
6 |
+
class QuestionGenerationError(Exception):
    """Raised when the question-generation pipeline fails.

    Used to distinguish expected, user-reportable generation failures from
    unexpected exceptions.
    """
|
9 |
+
|
10 |
+
def get_session_id():
    """Return this browser session's UUID, minting one on first access.

    The id is stored in st.session_state so repeated calls within the same
    session return the same value.
    """
    state = st.session_state
    if 'session_id' not in state:
        state.session_id = str(uuid.uuid4())
    return state.session_id
|
14 |
+
|
15 |
+
def initialize_state(session_id):
    """Create (if needed) and return the per-session state dict.

    Lazily builds the top-level `session_states` mapping in
    st.session_state, then seeds an entry for session_id with its default
    keys on first use.
    """
    if 'session_states' not in st.session_state:
        st.session_state.session_states = {}

    sessions = st.session_state.session_states
    if session_id not in sessions:
        sessions[session_id] = {
            'generated_questions': [],
            # add other state variables as needed
        }
    return sessions[session_id]
|
25 |
+
|
26 |
+
def get_state(session_id):
    """Return the state dict for session_id.

    Assumes initialize_state(session_id) has already been called for this
    session; otherwise the lookup will fail.
    """
    return st.session_state.session_states[session_id]
|
28 |
+
|
29 |
+
def set_state(session_id, key, value):
    """Store value under key in the given session's state dict.

    Assumes initialize_state(session_id) has already been called for this
    session.
    """
    st.session_state.session_states[session_id][key] = value
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
# Info Section
|
35 |
+
# Info Section
def display_info():
    """Render the static "Information" help panel in the Streamlit sidebar.

    Purely presentational: writes a title plus a markdown description of the
    question-generation pipeline. Takes no arguments and returns None.
    """
    st.sidebar.title("Information")
    st.sidebar.markdown("""
### Question Generator System
This system is designed to generate questions based on the provided context. It uses various NLP techniques and models to:
- Extract keywords from the text
- Map keywords to sentences
- Generate questions
- Provide multiple choice options
- Assess the quality of generated questions

#### Key Features:
- **Keyword Extraction:** Combines RAKE, TF-IDF, and spaCy for comprehensive keyword extraction.
- **Question Generation:** Utilizes a pre-trained T5 model for generating questions.
- **Options Generation:** Creates contextually relevant multiple-choice options.
- **Question Assessment:** Scores questions based on relevance, complexity, and spelling correctness.
- **Feedback Collection:** Allows users to rate the generated questions and provides statistics on feedback.

#### Customization Options:
- Number of beams for question generation
- Context window size for mapping keywords to sentences
- Number of questions to generate
- Additional display elements (context, answer, options, entity link, QA scores)

#### Outputs:
- Generated questions with multiple-choice options
- Download options for CSV and PDF formats
- Visualization of overall scores

""")
|
65 |
+
|
66 |
+
|
67 |
+
# Function to perform entity linking using Wikipedia API
|
68 |
+
# Function to perform entity linking using Wikipedia API
@lru_cache(maxsize=128)
def entity_linking(keyword):
    """Resolve keyword to a Wikipedia page URL.

    Results are memoized (128 entries) to avoid repeated API calls for the
    same keyword.

    Returns:
        The page's full URL when a matching page exists, else None.
    """
    _, wiki = initialize_wikiapi()
    page = wiki.page(keyword)
    return page.fullurl if page.exists() else None
|
75 |
+
|
visualization.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from wordcloud import WordCloud
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
def display_word_cloud(generated_questions):
    """Render a word cloud of word frequencies across all generated questions.

    Args:
        generated_questions: iterable of question strings.
    """
    # Count whitespace-separated word frequencies across every question.
    word_frequency = {}
    for question in generated_questions:
        for word in question.split():
            word_frequency[word] = word_frequency.get(word, 0) + 1

    # WordCloud.generate_from_frequencies raises on an empty mapping, so
    # bail out gracefully when there is nothing to draw.
    if not word_frequency:
        st.warning("No questions available to build a word cloud.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequency)
    # Draw on an explicit Figure and hand it to st.pyplot(): calling
    # st.pyplot() with no argument uses matplotlib's global figure, which
    # is deprecated in recent Streamlit releases.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    plt.close(fig)  # free the figure once Streamlit has rendered it
|