import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim.utils import simple_preprocess
import pickle
import re
import pyLDAvis
import pyLDAvis.gensim  # note: in pyLDAvis >= 3.0 this module is named pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint
import gensim.corpora as corpora
import spacy
import datetime
import os
import gradio as gr


def preprocess_response(response):
    # Remove punctuation
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response


def preprocess_responses(responses):
    preprocessed_responses = [preprocess_response(response) for response in responses]
    return preprocessed_responses


def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuation
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)


def make_bigram_trigram_models(data_words):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold, fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Phraser objects are a faster way to apply the trained phrase models
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod


def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]


def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]


def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def process_text(texts, stop_words, bigram_mod, nlp):
    # Remove stop words
    data_words_nostops = remove_stopwords(texts, stop_words)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
    # Lemmatize, keeping only nouns, adjectives, verbs and adverbs
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized


def create_lda_model(corpus, id2word, num_topics=10):
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
    return lda_model


def print_topics(lda_model):
    # topics = pprint(lda_model.print_topics())
    topics = lda_model.print_topics()
    return topics


def visualize_lda_model(lda_model, corpus, id2word):
    p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return p


def topic_modeling(reviews):
    # Split the input string into individual reviews
    responses = reviews.split("\n")

    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)

    # Convert responses to lists of words
    data_words = list(sent_to_words(preprocessed_responses))

    # Create stop words list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)

    # Initialize spaCy 'en' model, keeping only the tagger component (for efficiency)
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Process text (remove stopwords, make bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)

    # Create dictionary and corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]

    # Create LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=10)

    # Extract topics
    topics = print_topics(lda_model)

    # Visualize LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)

    current_dir = os.path.dirname(__file__)         # Directory containing this script
    filename = "lda_visualization.html"             # Output filename
    filepath = os.path.join(current_dir, filename)  # Combine directory and filename

    # Write the HTML visualization to the file
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)
    print("Successfully saved in", filename)

    return topics, visualization_html


# Interface
iface = gr.Interface(
    fn=topic_modeling,
    inputs="text",
    # outputs=["text", "html"]
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization")
    ]
)

iface.launch(share=True)