import os
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import pyLDAvis
# pyLDAvis renamed its gensim module in version 3.x; on pyLDAvis < 3.0 use
# `import pyLDAvis.gensim as gensimvis` instead.
import pyLDAvis.gensim_models as gensimvis

import spacy
import spacy.cli
spacy.cli.download("en_core_web_sm")

import gradio as gr
def preprocess_response(response):
    # Remove punctuation
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response

def preprocess_responses(responses):
    preprocessed_responses = [preprocess_response(response) for response in responses]
    return preprocessed_responses
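# Example (illustrative input, not from the app):
# preprocess_responses(["Great product!", "Too slow..."]) -> ["great product", "too slow"]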
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes accents/punctuation during tokenization
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
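# Example (illustrative): list(sent_to_words(["great product"])) -> [["great", "product"]]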
def make_bigram_trigram_models(data_words):
    # Build the bigram and trigram models; a higher threshold yields fewer phrases
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    # Phraser gives a faster, frozen version of each model for applying to text
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod
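# Example (illustrative): once trained, the phraser joins frequent word pairs, e.g.
# bigram_mod[["customer", "service", "was", "slow"]] -> ["customer_service", "was", "slow"]
# provided "customer service" clears the min_count and threshold settings above.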
def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]
def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Keep only the lemmas of the allowed parts of speech.

    See https://spacy.io/api/annotation for the POS tag set.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
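# Example (illustrative): ["the", "orders", "arrived", "quickly"]
# -> ["order", "arrive", "quickly"] ("the" is dropped as a determiner).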
def process_text(texts, stop_words, bigram_mod, nlp):
    # Remove stop words
    data_words_nostops = remove_stopwords(texts, stop_words)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
    # Lemmatize, keeping only nouns, adjectives, verbs, and adverbs
    data_lemmatized = lemmatization(data_words_bigrams, nlp,
                                    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized
def create_lda_model(corpus, id2word, num_topics=6):
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
    return lda_model
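# Optional sketch for choosing num_topics: gensim's CoherenceModel scores a fitted
# model against the lemmatized texts ('c_v' is one common measure). This helper is
# not called anywhere below; it is shown only as a tuning aid.
def compute_coherence(lda_model, texts, id2word):
    from gensim.models import CoherenceModel
    coherence_model = CoherenceModel(model=lda_model, texts=texts,
                                     dictionary=id2word, coherence='c_v')
    return coherence_model.get_coherence()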
def print_topics(lda_model):
    topics = lda_model.print_topics()
    return topics
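# print_topics returns (topic_id, weighted-terms) pairs, e.g. (illustrative words):
# (0, '0.031*"service" + 0.027*"order" + 0.022*"time" + ...')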
def visualize_lda_model(lda_model, corpus, id2word):
    # On pyLDAvis < 3.0 this call was pyLDAvis.gensim.prepare
    p = gensimvis.prepare(lda_model, corpus, id2word)
    return p
def topic_modeling(reviews):
    # Split the input into individual reviews, one per line; skip blank lines
    responses = [line for line in reviews.split("\n") if line.strip()]
    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)
    # Tokenize responses into words
    data_words = list(sent_to_words(preprocessed_responses))
    # Build the stop-word list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    # Create bigram and trigram models (the trigram model is unused in this pipeline)
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)
    # Initialize the spaCy 'en' model, disabling unneeded components for efficiency
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    # Process text (remove stopwords, form bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)
    # Create the dictionary and corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]
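    # doc2bow maps each document to sparse (token_id, count) pairs, e.g. a document
    # containing "order" twice becomes [(id2word.token2id["order"], 2), ...] (illustrative).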
    # Create the LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=6)
    # Collect the topics
    topics = print_topics(lda_model)
    # Visualize the LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)
    current_dir = os.path.dirname(__file__)  # Directory containing this script
    filename = "lda_visualization.html"
    filepath = os.path.join(current_dir, filename)
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)  # Write the HTML visualization to the file
    print("Successfully saved", filename)
    return topics, visualization_html
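# Expected input (illustrative): a newline-separated block of reviews, e.g.
# "Fast shipping and great quality\nThe product broke after a week\n..."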
# Interface
iface = gr.Interface(
    fn=topic_modeling,
    inputs=gr.Textbox(label="Reviews (one per line)", lines=10),
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization")
    ]
)
iface.launch(share=True)
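# Note: share=True asks Gradio for a temporary public link when run locally; it is
# unnecessary on Hugging Face Spaces, where the app is already served publicly.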