import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim.utils import simple_preprocess
import pickle
import re
import pyLDAvis
import pyLDAvis.gensim  # note: in pyLDAvis >= 3.0 this module is named pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint
import gensim.corpora as corpora
import spacy
import datetime
import os
import gradio as gr


def preprocess_response(response):
    # Remove punctuation
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response


def preprocess_responses(responses):
    preprocessed_responses = [preprocess_response(response) for response in responses]
    return preprocessed_responses


def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuation
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)


def make_bigram_trigram_models(data_words):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold, fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Phraser objects are a faster way to apply the trained phrase models
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod


def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]


def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]


def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def process_text(texts, stop_words, bigram_mod, nlp):
    # Remove stop words
    data_words_nostops = remove_stopwords(texts, stop_words)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
    # Lemmatize, keeping only nouns, adjectives, verbs and adverbs
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized


def create_lda_model(corpus, id2word, num_topics=10):
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
    return lda_model


def print_topics(lda_model):
    # topics = pprint(lda_model.print_topics())
    topics = lda_model.print_topics()
    return topics


def visualize_lda_model(lda_model, corpus, id2word):
    p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return p


def topic_modeling(reviews):
    # Split the input string into individual reviews
    responses = reviews.split("\n")

    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)

    # Convert responses to lists of words
    data_words = list(sent_to_words(preprocessed_responses))

    # Create stop words list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)

    # Initialize spaCy 'en' model, keeping only the tagger component (for efficiency)
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Process text (remove stopwords, make bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)

    # Create dictionary and corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]

    # Create LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=10)

    # Extract topics
    topics = print_topics(lda_model)

    # Visualize LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)

    current_dir = os.path.dirname(__file__)         # Directory containing this script
    filename = "lda_visualization.html"             # Output filename
    filepath = os.path.join(current_dir, filename)  # Combine directory and filename

    # Write the HTML visualization to the file
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)
    print("Successfully saved in", filename)

    return topics, visualization_html


# Interface
iface = gr.Interface(
    fn=topic_modeling,
    inputs="text",
    # outputs=["text", "html"]
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization")
    ]
)

iface.launch(share=True)