# topic_modelling/app.py
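"""Gradio app for LDA topic modelling of free-text reviews.

The app takes newline-separated reviews, preprocesses them (lowercasing,
punctuation and stopword removal, bigram merging, spaCy lemmatization),
fits a gensim LDA model, and returns the discovered topics together with
an interactive pyLDAvis visualization rendered as HTML.
"""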
import os
import re

import gensim
import gensim.corpora as corpora
import gradio as gr
import nltk
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim  # on pyLDAvis >= 3.0 this module is pyLDAvis.gensim_models
import spacy
import spacy.cli
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer  # kept from the original; lemmatization below uses spaCy

# Download the resources needed at runtime (NLTK corpora and the spaCy English model)
nltk.download('stopwords')
nltk.download('wordnet')
spacy.cli.download("en_core_web_sm")
def preprocess_response(response):
    """Lowercase a single response and strip common punctuation marks."""
    # Remove common punctuation (commas, periods, exclamation and question marks)
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response
def preprocess_responses(responses):
    """Apply preprocess_response to every response in the list."""
    preprocessed_responses = [preprocess_response(response) for response in responses]
    return preprocessed_responses
def sent_to_words(sentences):
    """Tokenize each sentence into a list of words with gensim's simple_preprocess."""
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuation
def make_bigram_trigram_models(data_words):
    """Build bigram and trigram phrase detectors from the tokenized documents."""
    # Build the bigram and trigram models; a higher threshold yields fewer phrases.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    # Phraser is a lighter, faster wrapper for applying a trained Phrases model
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod
def make_bigrams(texts, bigram_mod):
    """Merge detected bigrams in each tokenized document (e.g. "machine", "learning" -> "machine_learning")."""
    return [bigram_mod[doc] for doc in texts]
def remove_stopwords(texts, stop_words):
    """Drop stop words from each tokenized document."""
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each document with spaCy, keeping only the allowed parts of speech.

    See https://spacy.io/api/annotation for the POS tag set.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
def process_text(texts, stop_words, bigram_mod, nlp):
    """Run the full token pipeline: stopword removal, bigram merging, lemmatization."""
    # Remove stop words
    data_words_nostops = remove_stopwords(texts, stop_words)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
    # Lemmatize, keeping only nouns, adjectives, verbs and adverbs
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized
def create_lda_model(corpus, id2word, num_topics=6):
    """Fit a gensim LDA model on the bag-of-words corpus."""
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,  # fixed seed for reproducible topics
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
    return lda_model
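# Optional, not part of the original app: if num_topics needs tuning, gensim's
# CoherenceModel can score a fitted model against the lemmatized texts. A minimal
# sketch, assuming `lda_model`, `processed_data` and `id2word` from topic_modeling():
#
#   from gensim.models import CoherenceModel
#   cm = CoherenceModel(model=lda_model, texts=processed_data,
#                       dictionary=id2word, coherence='c_v')
#   print(cm.get_coherence())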
def print_topics(lda_model):
    """Return the topics as (topic_id, weighted term string) pairs."""
    topics = lda_model.print_topics()
    return topics
def visualize_lda_model(lda_model, corpus, id2word):
    """Prepare the pyLDAvis visualization for the fitted model."""
    p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return p
def topic_modeling(reviews):
    """Run the end-to-end pipeline on newline-separated reviews.

    Returns the LDA topics and the pyLDAvis visualization as an HTML string.
    """
    # Split the input string into individual reviews (one per line)
    responses = reviews.split("\n")
    # Preprocess responses (lowercase, strip punctuation)
    preprocessed_responses = preprocess_responses(responses)
    # Tokenize responses into words
    data_words = list(sent_to_words(preprocessed_responses))
    # Build the stop word list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)
    # Initialize the spaCy English model, disabling the parser and NER for speed
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    # Process text (remove stopwords, merge bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)
    # Create the dictionary and bag-of-words corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]
    # Fit the LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=6)
    # Collect the topics
    topics = print_topics(lda_model)
    # Build the pyLDAvis visualization and render it to standalone HTML
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)
    # Also save the HTML next to this script
    current_dir = os.path.dirname(__file__)
    filepath = os.path.join(current_dir, "lda_visualization.html")
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)
    print("Successfully saved visualization to", filepath)
    return topics, visualization_html
# Gradio interface: free-text input, topics as text plus the pyLDAvis HTML output
iface = gr.Interface(
    fn=topic_modeling,
    inputs="text",
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization"),
    ],
)
iface.launch(share=True)
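# Illustrative input for the textbox (one review per line); any review-like text works:
#   The battery life on this phone is excellent and the camera takes sharp photos.
#   Shipping took too long and the box arrived damaged.
#   Great value for the price, the staff were friendly and helpful.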