Spaces:
Sleeping
Sleeping
File size: 5,088 Bytes
d94e8a0 67ead70 d94e8a0 7acad9d d94e8a0 154e1a5 d94e8a0 154e1a5 d94e8a0 5453656 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# Module set-up: imports plus one-off resource downloads that run every time
# this file is imported or executed.
import nltk
# Fetch the NLTK 'stopwords' and 'wordnet' resources by name.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# NOTE(review): WordNetLemmatizer is not referenced in the visible code.
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.utils import simple_preprocess
import re
import pyLDAvis
import pyLDAvis.gensim
# NOTE(review): pandas is not referenced in the visible code.
import pandas as pd
# NOTE(review): duplicate of the `import re` a few lines above (harmless).
import re
import gensim.corpora as corpora
import spacy
import os
import gradio as gr
import spacy.cli
# Fetch the spaCy model that topic_modeling() later loads by this same name.
spacy.cli.download("en_core_web_sm")
def preprocess_response(response):
    """Normalise one raw response string.

    Commas, full stops, exclamation marks and question marks are deleted,
    then the remaining text is lower-cased. Every other character is kept.
    """
    without_punctuation = re.sub(r'[,\.!?]', '', response)
    return without_punctuation.lower()
def preprocess_responses(responses):
    """Run ``preprocess_response`` over every item and return the results.

    The output is a new list with one cleaned string per input item, in the
    same order as the input.
    """
    cleaned = []
    for raw in responses:
        cleaned.append(preprocess_response(raw))
    return cleaned
def sent_to_words(sentences):
    """Lazily tokenise each sentence with gensim's ``simple_preprocess``.

    This is a generator: it yields one token list per input item. Each item
    is coerced with ``str`` first, and ``deacc=True`` asks gensim to strip
    accent marks from the tokens.
    """
    for raw in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens
def make_bigram_trigram_models(data_words):
    """Fit gensim phrase detectors on tokenised documents.

    A bigram ``Phrases`` model is trained on ``data_words`` (min_count=5,
    threshold=100; a higher threshold yields fewer phrases). A second
    ``Phrases`` model is then trained on the bigram-transformed documents.
    Both are wrapped in ``Phraser`` objects, which are the faster form for
    transforming sentences.

    Returns:
        A ``(bigram_mod, trigram_mod)`` tuple of ``Phraser`` objects.
    """
    bigram_detector = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram_detector = gensim.models.Phrases(bigram_detector[data_words], threshold=100)
    return (
        gensim.models.phrases.Phraser(bigram_detector),
        gensim.models.phrases.Phraser(trigram_detector),
    )
def make_bigrams(texts, bigram_mod):
    """Transform every document by indexing ``bigram_mod`` with it.

    Returns a list holding ``bigram_mod[doc]`` for each ``doc`` in ``texts``,
    in input order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged
def remove_stopwords(texts, stop_words):
    """Re-tokenise each document and drop the tokens listed in ``stop_words``.

    Each document is converted with ``str`` and passed through gensim's
    ``simple_preprocess`` before filtering, exactly as before.

    Args:
        texts: iterable of documents (token lists or strings).
        stop_words: iterable of words to remove.

    Returns:
        A list with one filtered token list per document.
    """
    # Build the lookup once: callers pass a list, and testing membership
    # against a list for every token is a linear scan each time.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]
def lemmatization(texts, nlp, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and passed to
    ``nlp``; for every resulting token whose ``pos_`` is in
    ``allowed_postags`` the ``lemma_`` is kept. Tag names are described at
    https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists.
        nlp: callable that turns a string into an iterable of tokens exposing
            ``lemma_`` and ``pos_``.
        allowed_postags: container of POS tags to keep. The default is a
            tuple rather than a list so that it cannot be mutated and shared
            between calls.

    Returns:
        A list with one list of lemmas per input document.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]
def process_text(texts, stop_words, bigram_mod, nlp):
    """Clean tokenised documents in three steps.

    The steps run in this order: stop-word removal, bigram merging with
    ``bigram_mod``, then lemmatisation that keeps only nouns, adjectives,
    verbs and adverbs.

    Returns:
        The lemmatised documents, one token list per input document.
    """
    content_pos_tags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    without_stops = remove_stopwords(texts, stop_words)
    with_bigrams = make_bigrams(without_stops, bigram_mod)
    return lemmatization(with_bigrams, nlp, allowed_postags=content_pos_tags)
def create_lda_model(corpus, id2word, num_topics=6):
    """Train and return a gensim ``LdaModel`` on ``corpus``.

    Apart from ``num_topics`` the training settings are fixed:
    random_state=100, chunksize=100, passes=10 and per_word_topics=True.
    """
    fixed_settings = {
        "random_state": 100,
        "chunksize": 100,
        "passes": 10,
        "per_word_topics": True,
    }
    return gensim.models.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        **fixed_settings,
    )
def print_topics(lda_model):
    """Return whatever ``lda_model.print_topics()`` produces.

    Despite the name, this function prints nothing itself.
    """
    return lda_model.print_topics()
def visualize_lda_model(lda_model, corpus, id2word):
    """Return the result of ``pyLDAvis.gensim.prepare`` for the given model.

    The model, corpus and dictionary are passed through positionally, in
    that order.
    """
    return pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
def topic_modeling(reviews):
    """Run the LDA topic-modelling pipeline over newline-separated reviews.

    Each non-blank line of ``reviews`` is treated as one document. The
    documents are cleaned, tokenised, stripped of stop words, bigram-merged
    and lemmatised. A 6-topic LDA model is then trained and rendered with
    pyLDAvis. The rendered HTML is also written to ``lda_visualization.html``
    in the directory of this script.

    Args:
        reviews: a string holding one review per line.

    Returns:
        A ``(topics, visualization_html)`` tuple: the model's
        ``print_topics()`` output and the pyLDAvis HTML string.

    Raises:
        ValueError: if ``reviews`` contains no non-blank line.
    """
    # One review per line. Blank lines are skipped so they do not enter the
    # corpus as empty documents.
    responses = [line for line in reviews.split("\n") if line.strip()]
    if not responses:
        # Fail early with a clear message instead of deep inside model training.
        raise ValueError("No reviews provided: enter at least one non-blank line.")

    # Clean the raw text, then tokenise it
    preprocessed_responses = preprocess_responses(responses)
    data_words = list(sent_to_words(preprocessed_responses))

    # NLTK English stop words plus a few extra tokens
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    # Only the bigram model is used below; the trigram model is discarded
    bigram_mod, _ = make_bigram_trigram_models(data_words)

    # Load the spaCy model with the parser and NER components disabled
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Remove stop words, merge bigrams, lemmatise
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)

    # Dictionary and bag-of-words corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]

    # Train the model and collect its topic summaries
    lda_model = create_lda_model(corpus, id2word, num_topics=6)
    topics = print_topics(lda_model)

    # Render the interactive visualisation as HTML
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)

    # Save next to this script: dirname(__file__) is the script's directory,
    # which is not necessarily the current working directory.
    filename = "lda_visualization.html"
    filepath = os.path.join(os.path.dirname(__file__), filename)
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)
    print("Successfully saved in", filename)

    return topics, visualization_html
# Gradio UI: one free-text input is passed to topic_modeling, and its two
# return values fill a "Topics" text box and a "Visualization" HTML panel.
iface = gr.Interface(
    fn=topic_modeling,
    inputs="text",
    outputs=[gr.Textbox(label="Topics"), gr.HTML(label="Visualization")],
)
# Start the app, passing share=True to launch().
iface.launch(share=True)
#rebuild |