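"""Gradio app for LDA topic modeling.

Takes newline-separated reviews, preprocesses them (lowercasing, stopword
removal, bigram detection, spaCy lemmatization), fits a gensim LDA model,
and returns the discovered topics alongside an interactive pyLDAvis
visualization.
"""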
import os
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import pyLDAvis
import pyLDAvis.gensim  # renamed to pyLDAvis.gensim_models in pyLDAvis >= 3.0

import spacy
import spacy.cli

import gradio as gr

# Download the small English pipeline once so spacy.load() below succeeds.
spacy.cli.download("en_core_web_sm")

def preprocess_response(response):
    # Remove punctuation
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response

def preprocess_responses(responses):
    preprocessed_responses = [preprocess_response(response) for response in responses]
    return preprocessed_responses

def sent_to_words(sentences):
    for sentence in sentences:
        # simple_preprocess tokenizes and lowercases; deacc=True also strips accents
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def make_bigram_trigram_models(data_words):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # a higher threshold yields fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    return bigram_mod, trigram_mod
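
# Illustrative example (hypothetical tokens): with enough co-occurrences in the
# training data, bigram_mod[["new", "york", "pizza"]] would yield
# ["new_york", "pizza"]; the actual phrases depend on min_count and threshold.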

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def process_text(texts, stop_words, bigram_mod, nlp):
    # Remove Stop Words
    data_words_nostops = remove_stopwords(texts, stop_words)

    # Form Bigrams (only the bigram model is applied; trigram_mod goes unused)
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    return data_lemmatized

def create_lda_model(corpus, id2word, num_topics=6):
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,      # fixed seed for reproducible topics
                                       chunksize=100,         # documents per training chunk
                                       passes=10,             # full passes over the corpus
                                       per_word_topics=True)
    return lda_model

def print_topics(lda_model):
    # Return the topics as (topic_id, keyword-weight string) pairs
    return lda_model.print_topics()

def visualize_lda_model(lda_model, corpus, id2word):
    p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return p

def topic_modeling(reviews):
    # Split the input string into individual reviews
    responses = reviews.split("\n")
    
    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)

    # Convert responses to words
    data_words = list(sent_to_words(preprocessed_responses))

    # Create stop words list, extended with a few extra high-frequency tokens
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Process text (remove stopwords, make bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)

    # Create Dictionary and Corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]

    # Create LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=6)

    # Print topics
    topics = print_topics(lda_model)
    
    # Visualize LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)

    visualization_html = pyLDAvis.prepared_data_to_html(visualization)

    current_dir = os.path.dirname(__file__)  # Directory containing this script
    filename = "lda_visualization.html"
    filepath = os.path.join(current_dir, filename)
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)  # Save a standalone copy of the visualization
    print("Successfully saved", filepath)

    return topics, visualization_html
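

# A hedged sketch (an assumption, not part of the original app): gensim's
# CoherenceModel can score a fitted LDA model, which helps pick num_topics.
def coherence_score(lda_model, processed_data, id2word):
    from gensim.models import CoherenceModel
    return CoherenceModel(model=lda_model, texts=processed_data,
                          dictionary=id2word, coherence='c_v').get_coherence()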

# Interface
iface = gr.Interface(
    fn=topic_modeling,
    inputs=gr.Textbox(lines=10, label="Reviews (one per line)"),
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization"),
    ],
)

iface.launch(share=True)  # share=True also serves a temporary public link