import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import pandas as pd
from fpdf import FPDF
import wikipediaapi
from functools import lru_cache
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('brown')
from nltk.tokenize import sent_tokenize
nltk.download('wordnet')
from nltk.corpus import wordnet
import random
from sense2vec import Sense2Vec
import sense2vec
from wordcloud import WordCloud
import matplotlib.pyplot as plt
print("***************************************************************")

st.set_page_config(
    page_title="Question Generator",
    initial_sidebar_state="collapsed",
)
# Load spaCy model
nlp = spacy.load("en_core_web_md")
# s2v = Sense2Vec.from_disk(self=Sense2Vec,path='s2v_old')

s2v = sense2vec.Sense2Vec().from_disk('s2v_old')
# Initialize Wikipedia API with a user agent
user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')

@st.cache_resource
def load_model():
    model_name = "DevBM/t5-large-squad"
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return model, tokenizer

# Function to extract keywords using combined techniques
def extract_keywords(text, extract_all):
    doc = nlp(text)
    spacy_keywords = set([ent.text for ent in doc.ents])
    spacy_entities = spacy_keywords
    print(f"\n\nSpacy Entities: {spacy_entities} \n\n")  

    # Use Only Spacy Entities
    if extract_all is False:
        return list(spacy_entities) 
    
    # Use RAKE
    rake = Rake()
    rake.extract_keywords_from_text(text)
    rake_keywords = set(rake.get_ranked_phrases())
    print(f"\n\nRake Keywords: {rake_keywords} \n\n")
    # Use spaCy for NER and POS tagging
    spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
    print(f"\n\nSpacy Keywords: {spacy_keywords} \n\n")
    # Use TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    tfidf_keywords = set(vectorizer.get_feature_names_out())
    print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")

    # Combine all keywords
    combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
    
    return list(combined_keywords)

def get_similar_words_sense2vec(word, n=3):
    # Try to find the word with its most likely part-of-speech
    word_with_pos = word + "|NOUN"
    if word_with_pos in s2v:
        similar_words = s2v.most_similar(word_with_pos, n=n)
        return [word.split("|")[0] for word, _ in similar_words]
    
    # If not found, try without POS
    if word in s2v:
        similar_words = s2v.most_similar(word, n=n)
        return [word.split("|")[0] for word, _ in similar_words]
    
    return []

def get_synonyms(word, n=3):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word and lemma.name() not in synonyms:
                synonyms.append(lemma.name())
                if len(synonyms) == n:
                    return synonyms
    return synonyms

def generate_options(answer, context, n=3):
    options = [answer]
    
    # Try to get similar words based on sense2vec
    similar_words = get_similar_words_sense2vec(answer, n)
    options.extend(similar_words)
    
    # If we don't have enough options, try synonyms
    if len(options) < n + 1:
        synonyms = get_synonyms(answer, n - len(options) + 1)
        options.extend(synonyms)
    
    # If we still don't have enough options, extract other entities from the context
    if len(options) < n + 1:
        doc = nlp(context)
        entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
        options.extend(entities[:n - len(options) + 1])
    
    # If we still need more options, add some random words from the context
    if len(options) < n + 1:
        context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
        options.extend(random.sample(context_words, min(n - len(options) + 1, len(context_words))))
    
    # Ensure we have the correct number of unique options
    options = list(dict.fromkeys(options))[:n+1]
    
    # Shuffle the options
    random.shuffle(options)
    
    return options

# Function to map keywords to sentences with customizable context window size
def map_keywords_to_sentences(text, keywords, context_window_size):
    sentences = sent_tokenize(text)
    keyword_sentence_mapping = {}
    for keyword in keywords:
        for i, sentence in enumerate(sentences):
            if keyword in sentence:
                # Combine current sentence with surrounding sentences for context
                start = max(0, i - context_window_size)
                end = min(len(sentences), i + context_window_size + 1)
                context = ' '.join(sentences[start:end])
                if keyword not in keyword_sentence_mapping:
                    keyword_sentence_mapping[keyword] = context
                else:
                    keyword_sentence_mapping[keyword] += ' ' + context
    return keyword_sentence_mapping


# Function to perform entity linking using Wikipedia API
@lru_cache(maxsize=128)
def entity_linking(keyword):
    page = wiki_wiki.page(keyword)
    if page.exists():
        return page.fullurl
    return None

# Function to generate questions using beam search
def generate_question(context, answer, num_beams):
    input_text = f"<context> {context} <answer> {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

# Function to export questions to CSV
def export_to_csv(data):
    df = pd.DataFrame(data, columns=["Context", "Answer", "Question", "Options"])
    csv = df.to_csv(index=False,encoding='utf-8')
    return csv

# Function to export questions to PDF
def export_to_pdf(data):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    
    for context, answer, question, options in data:
        pdf.multi_cell(0, 10, f"Context: {context}")
        pdf.multi_cell(0, 10, f"Answer: {answer}")
        pdf.multi_cell(0, 10, f"Question: {question}")
        pdf.ln(10)
    
    # pdf.output("questions.pdf")
    return pdf.output(name='questions.pdf',dest='S').encode('latin1')

def display_word_cloud(generated_questions):
    word_frequency = {}
    for question in generated_questions:
        words = question.split()
        for word in words:
            word_frequency[word] = word_frequency.get(word, 0) + 1

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequency)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot()

if 'data' not in st.session_state:
    st.session_state.data = None

# Streamlit interface
st.title(":blue[Question Generator from Text]")
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")

with st.sidebar:
    st.subheader("Customization Options")
    # Customization options
    num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
    context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
    num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
    with st.expander("Choose the Additional Elements to show"):
        show_context = st.checkbox("Context",True)
        show_answer = st.checkbox("Answer",True)
        show_options = st.checkbox("Options",False)
        show_entity_link = st.checkbox("Enitity Link For Wikipedia",True)
    extract_all_keywords = st.toggle("Extract max Keywords",value=False)

if st.button("Generate Questions"):
    if text:
        model, tokenizer = load_model()
        keywords = extract_keywords(text,extract_all_keywords)
        print(f"\n\nFinal Keywords in Main Function: {keywords}\n\n")
        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
        
        st.subheader("Generated Questions:",divider='blue')
        data = []
        for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
            if i >= num_questions:
                break
            linked_entity = entity_linking(keyword)
            question = generate_question(context, keyword, num_beams=num_beams)
            options = generate_options(keyword, context)
            st.subheader(body=f":orange[Q{i+1}:] {question}")

            if show_context is True:
                st.write(f"**Context:** {context}")
            if show_answer is True:
                st.write(f"**Answer:** {keyword}")
            if show_options is True:
                st.write(f"**Options:**")
                for j, option in enumerate(options):
                    st.write(f"{chr(65+j)}. {option}")
            if show_entity_link is True:
                if linked_entity:
                    st.write(f"**Entity Link:** {linked_entity}")
            st.write("---")
            data.append((context, keyword, question, options))

        # Add the data to session state
        st.session_state.data = data
        # display_word_cloud()
        print(data)
        # Export buttons
        if st.session_state.data is not None:
            with st.sidebar:
                st.subheader('Download Content')
                csv_data = export_to_csv(data)
                st.download_button(label="CSV Format", data=csv_data, file_name='questions.csv', mime='text/csv')

                pdf_data = export_to_pdf(data)
                st.download_button(label="PDF Format", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
        if st.session_state.data is not None:
            st.markdown("You can download the data from the sidebar.")

    
    else:
        st.write("Please enter some text to generate questions.")
    print("********************************************************************************")