import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import pandas as pd
from fpdf import FPDF
import wikipediaapi

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('brown')
from nltk.tokenize import sent_tokenize

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize Wikipedia API with a user agent
user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')

# Load T5 model and tokenizer
model_name = "DevBM/t5-large-squad"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Function to extract keywords using combined techniques
def extract_keywords(text):
    # Use RAKE
    rake = Rake()
    rake.extract_keywords_from_text(text)
    rake_keywords = set(rake.get_ranked_phrases())
    
    # Use spaCy for NER and POS tagging
    doc = nlp(text)
    spacy_keywords = set([ent.text for ent in doc.ents])
    spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
    
    # Use TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    tfidf_keywords = set(vectorizer.get_feature_names_out())
    
    # Combine all keywords
    combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
    
    return list(combined_keywords)

# Function to map keywords to sentences with customizable context window size
def map_keywords_to_sentences(text, keywords, context_window_size):
    sentences = sent_tokenize(text)
    keyword_sentence_mapping = {}
    for keyword in keywords:
        for i, sentence in enumerate(sentences):
            if keyword in sentence:
                # Combine current sentence with surrounding sentences for context
                start = max(0, i - context_window_size)
                end = min(len(sentences), i + context_window_size + 1)
                context = ' '.join(sentences[start:end])
                if keyword not in keyword_sentence_mapping:
                    keyword_sentence_mapping[keyword] = context
                else:
                    keyword_sentence_mapping[keyword] += ' ' + context
    return keyword_sentence_mapping

# Function to perform entity linking using Wikipedia API
def entity_linking(keyword):
    page = wiki_wiki.page(keyword)
    if page.exists():
        return page.fullurl
    return None

# Function to generate questions using beam search
def generate_question(context, answer, num_beams=5):
    input_text = f"<context> {context} <answer> {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

# Function to export questions to CSV
def export_to_csv(data):
    df = pd.DataFrame(data, columns=["Context", "Answer", "Question"])
    csv = df.to_csv(index=False, encoding='utf-8')
    return csv

# Function to export questions to PDF
def export_to_pdf(data):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    
    for context, answer, question in data:
        pdf.multi_cell(0, 10, f"Context: {context}")
        pdf.multi_cell(0, 10, f"Answer: {answer}")
        pdf.multi_cell(0, 10, f"Question: {question}")
        pdf.ln(10)
    
    return pdf.output(name='questions.pdf', dest='S').encode('latin1')

# Streamlit interface
st.title(":blue[Question Generator from Text]")
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")

# Customization options
num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
question_complexity = st.selectbox("Select question complexity", ["Simple", "Intermediate", "Complex"])  # note: not yet wired into question generation
download_csv = st.toggle('Download CSV', value=True)
download_pdf = st.toggle('Download PDF', value=True)
if st.button("Generate Questions"):
    if text:
        keywords = extract_keywords(text)
        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
        
        st.subheader("Generated Questions:")
        data = []
        for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
            if i >= num_questions:
                break
            linked_entity = entity_linking(keyword)
            question = generate_question(context, keyword, num_beams=num_beams)
            st.write(f"**Context:** {context}")
            st.write(f"**Answer:** {keyword}")
            st.write(f"**Question:** {question}")
            if linked_entity:
                st.write(f"**Entity Link:** {linked_entity}")
            st.write("---")
            data.append((context, keyword, question))
        
        # Export buttons
        if data:
            if download_csv:
                csv_data = export_to_csv(data)
                st.download_button(label="Download CSV", data=csv_data, file_name='questions.csv', mime='text/csv')

            if download_pdf:
                pdf_data = export_to_pdf(data)
                st.download_button(label="Download PDF", data=pdf_data, file_name='questions.pdf', mime='application/pdf')

    else:
        st.write("Please enter some text to generate questions.")