Create app.py

app.py ADDED
import nltk
# nltk.download('stopwords')  # uncomment on the first run to fetch the stopword list
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

import re
import os

import pyLDAvis
import pyLDAvis.gensim  # renamed to pyLDAvis.gensim_models in pyLDAvis >= 3.0

import spacy
import gradio as gr

def preprocess_response(response):
    # Remove punctuation
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response

def preprocess_responses(responses):
    preprocessed_responses = [preprocess_response(response) for response in responses]
    return preprocessed_responses

def sent_to_words(sentences):
    for sentence in sentences:
        # simple_preprocess tokenizes, lowercases, and drops punctuation; deacc=True also strips accents
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def make_bigram_trigram_models(data_words):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold, fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    return bigram_mod, trigram_mod

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def process_text(texts, stop_words, bigram_mod, nlp):
    # Remove stop words
    data_words_nostops = remove_stopwords(texts, stop_words)

    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

    # Lemmatize, keeping only nouns, adjectives, verbs, and adverbs
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    return data_lemmatized

def create_lda_model(corpus, id2word, num_topics=10):
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
    return lda_model

def print_topics(lda_model):
    # Returns a list of (topic_id, "weight*word + ...") pairs
    topics = lda_model.print_topics()
    return topics

def visualize_lda_model(lda_model, corpus, id2word):
    # pyLDAvis.gensim_models.prepare on pyLDAvis >= 3.0
    p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return p

def topic_modeling(reviews):
    # Split the input string into individual reviews, one per line
    responses = reviews.split("\n")

    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)

    # Convert responses to lists of words
    data_words = list(sent_to_words(preprocessed_responses))

    # Create stop words list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)

    # Initialize the spaCy 'en' model, keeping only the tagger component (for efficiency)
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Process text (remove stopwords, make bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)

    # Create dictionary and corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]

    # Create LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=10)

    # Extract topics
    topics = print_topics(lda_model)

    # Visualize LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)

    # Save the visualization next to this script (the script's directory, not the working directory)
    current_dir = os.path.dirname(__file__)
    filepath = os.path.join(current_dir, "lda_visualization.html")
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)
    print("Successfully saved in", filepath)

    return topics, visualization_html

# Interface
iface = gr.Interface(
    fn=topic_modeling,
    # Multi-line textbox, since the input is split on newlines (one review per line)
    inputs=gr.Textbox(label="Reviews (one per line)", lines=8),
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization")
    ]
)

iface.launch(share=True)
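
Note: for a quick sanity check outside the Gradio UI, topic_modeling can be called directly. A minimal sketch, assuming the NLTK stopwords corpus and spaCy's en_core_web_sm model are already downloaded; the sample reviews below are invented for illustration, and a larger batch of real reviews yields far more meaningful topics:

# Hypothetical smoke test of the pipeline (sample input is made up)
sample_reviews = "\n".join([
    "The battery lasts all day and the screen is bright and sharp",
    "Shipping was slow and the box arrived damaged",
    "Great value for the price and the camera quality surprised me",
])
topics, html = topic_modeling(sample_reviews)
for topic_id, words in topics:
    print(topic_id, words)

This also writes lda_visualization.html next to app.py, so the pyLDAvis output can be inspected in a browser without launching the app.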
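
The commit pins no dependencies. A plausible requirements.txt for this Space, inferred from the imports (exact versions are a guess; the pyLDAvis pin keeps the pyLDAvis.gensim import path, which moved to pyLDAvis.gensim_models in pyLDAvis 3.0):

nltk
gensim
pyLDAvis<3.0
spacy
gradio

The en_core_web_sm model and the NLTK stopwords corpus are separate downloads (spacy download en_core_web_sm / nltk.download('stopwords')), which on a Space usually means adding the model wheel to requirements.txt or fetching both at startup.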