import re
import os

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # renamed from pyLDAvis.gensim in pyLDAvis 3.x

import spacy
import spacy.cli
spacy.cli.download("en_core_web_sm")

import gradio as gr
def preprocess_response(response):
    # Remove punctuation
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response


def preprocess_responses(responses):
    return [preprocess_response(response) for response in responses]


def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True strips accent marks; simple_preprocess also lowercases
        # and drops tokens shorter than 2 or longer than 15 characters
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
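
# Illustrative check of the tokenization step (the exact tokens depend on
# gensim's simple_preprocess defaults, so treat this as a sketch):
#   list(sent_to_words(["Great product, works well!"]))
#   -> [['great', 'product', 'works', 'well']]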
def make_bigram_trigram_models(data_words):
    # Build the bigram and trigram models (higher threshold -> fewer phrases)
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    # Phraser is a lighter, faster wrapper for applying a trained Phrases model
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod
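
# Illustrative behavior of a trained Phraser: whether a pair is merged depends
# on its co-occurrence score in the training data, so this is a sketch, not a
# guarantee for any given input:
#   bigram_mod[['machine', 'learning', 'is', 'fun']]
#   -> ['machine_learning', 'is', 'fun']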
def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]


def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]


def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize and keep only the allowed parts of speech.

    See https://spacy.io/api/annotation for the tag set.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
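
# Illustrative lemmatization output (exact tags can vary across
# en_core_web_sm versions); note how the POS filter drops determiners
# and auxiliary verbs:
#   lemmatization([['the', 'batteries', 'were', 'draining', 'quickly']], nlp)
#   -> [['battery', 'drain', 'quickly']]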
def process_text(texts, stop_words, bigram_mod, nlp):
    # Remove stop words
    data_words_nostops = remove_stopwords(texts, stop_words)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
    # Lemmatize, keeping only nouns, adjectives, verbs, and adverbs
    data_lemmatized = lemmatization(data_words_bigrams, nlp,
                                    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized
def create_lda_model(corpus, id2word, num_topics=6):
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
    return lda_model
def print_topics(lda_model):
    return lda_model.print_topics()
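
# print_topics returns (topic_id, weighted-term-string) pairs; the weights
# below are made up purely to show the format:
#   [(0, '0.045*"battery" + 0.032*"charge" + ...'),
#    (1, '0.051*"screen" + 0.029*"bright" + ...'), ...]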
def visualize_lda_model(lda_model, corpus, id2word):
    return gensimvis.prepare(lda_model, corpus, id2word)
def topic_modeling(reviews):
    # Split the input string into individual reviews, one per line
    responses = reviews.split("\n")
    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)
    # Tokenize responses into words
    data_words = list(sent_to_words(preprocessed_responses))
    # Build the stop-word list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)
    # Initialize the spaCy 'en' model, keeping only the tagger component for efficiency
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    # Process text (remove stop words, form bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)
    # Create the dictionary and bag-of-words corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]
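    # Illustrative shape of the bag-of-words corpus: doc2bow maps each
    # document to (token_id, count) pairs, e.g.
    # ['battery', 'battery', 'charge'] -> [(id_of_battery, 2), (id_of_charge, 1)]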
    # Create the LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=6)
    # Collect the topics
    topics = print_topics(lda_model)
    # Visualize the LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)
    # Save the HTML next to this script (os.path.dirname(__file__) is the
    # script's directory, not necessarily the current working directory)
    current_dir = os.path.dirname(__file__)
    filename = "lda_visualization.html"
    filepath = os.path.join(current_dir, filename)
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)
    print("Successfully saved visualization to", filepath)
    return topics, visualization_html
# Interface
iface = gr.Interface(
    fn=topic_modeling,
    inputs=gr.Textbox(label="Reviews (one per line)", lines=10),
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization")
    ]
)
iface.launch(share=True)
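
# Usage: paste reviews into the textbox, one per line. The app returns the six
# discovered topics as text and renders the interactive pyLDAvis view, which
# is also written to lda_visualization.html next to this script.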