# Topic modeling on free-text reviews: gensim LDA, visualized with pyLDAvis, served through Gradio.
import os
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import pyLDAvis
import pyLDAvis.gensim  # note: newer pyLDAvis releases expose this module as pyLDAvis.gensim_models

import spacy
import spacy.cli
import gradio as gr

# Download the small English spaCy model at startup so spacy.load() below succeeds
spacy.cli.download("en_core_web_sm")


def preprocess_response(response):
    # Remove punctuation
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response


def preprocess_responses(responses):
    preprocessed_responses = [preprocess_response(response) for response in responses]
    return preprocessed_responses


def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuation


def make_bigram_trigram_models(data_words):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold -> fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod


def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]


def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def process_text(texts, stop_words, bigram_mod, nlp):
    # Remove stop words
    data_words_nostops = remove_stopwords(texts, stop_words)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
    # Lemmatize, keeping only nouns, adjectives, verbs, and adverbs
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized


def create_lda_model(corpus, id2word, num_topics=6):
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
    return lda_model


def print_topics(lda_model):
    topics = lda_model.print_topics()
    return topics


def visualize_lda_model(lda_model, corpus, id2word):
    p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return p


def topic_modeling(reviews):
    # Split the input string into individual reviews
    responses = reviews.split("\n")

    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)

    # Convert responses to words
    data_words = list(sent_to_words(preprocessed_responses))

    # Create stop words list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)

    # Initialize spaCy 'en' model, keeping only the tagger component (for efficiency)
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Process text (remove stopwords, make bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)

    # Create dictionary and corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]

    # Create LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=6)

    # Print topics
    topics = print_topics(lda_model)

    # Visualize LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)

    current_dir = os.path.dirname(__file__)         # Directory containing this script
    filename = "lda_visualization.html"             # Define the filename
    filepath = os.path.join(current_dir, filename)  # Combine directory and filename
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)              # Write the HTML data to the file
    print("Successfully saved in", filename)

    return topics, visualization_html


# Gradio interface
iface = gr.Interface(
    fn=topic_modeling,
    inputs="text",
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization")
    ]
)

iface.launch(share=True)
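
# A minimal sketch of exercising the pipeline directly, without the Gradio UI.
# The sample reviews below are hypothetical; any newline-separated text works.
# Uncomment to run topic_modeling() on its own (and skip iface.launch above):
#
# sample_reviews = (
#     "Great product, fast shipping and easy setup\n"
#     "Battery life is poor and the charger overheats\n"
#     "Love the design but the companion app keeps crashing"
# )
# topics, html = topic_modeling(sample_reviews)
# print(topics)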