import os
import re

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import gradio as gr
import nltk
import pandas as pd
import pyLDAvis
# Note: recent pyLDAvis releases renamed this module to pyLDAvis.gensim_models
import pyLDAvis.gensim
import spacy
import spacy.cli

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Download the NLTK corpora and the spaCy English model needed at runtime
nltk.download('stopwords')
nltk.download('wordnet')
spacy.cli.download("en_core_web_sm")
def preprocess_response(response):
    """Lowercase a single response and strip common punctuation."""
    # Remove common punctuation characters
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response


def preprocess_responses(responses):
    """Apply preprocess_response to every response in the list."""
    return [preprocess_response(response) for response in responses]


def sent_to_words(sentences):
    """Tokenize each sentence into a list of lowercase word tokens."""
    for sentence in sentences:
        # deacc=True removes punctuation/accents during tokenization
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
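
# Illustrative example of the tokenizer above:
#   gensim.utils.simple_preprocess("Great product, works well!", deacc=True)
# returns ['great', 'product', 'works', 'well'] -- tokens are lowercased and
# punctuation is dropped.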


def make_bigram_trigram_models(data_words):
    """Build bigram and trigram phrase models from the tokenized documents."""
    # Higher threshold => fewer phrases detected
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    # Phraser is a faster, lighter wrapper for applying a trained phrase model
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod
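
# Illustrative example: once trained, bigram_mod joins frequently co-occurring
# tokens with an underscore, e.g. ['machine', 'learning', 'rocks'] may become
# ['machine_learning', 'rocks'] if the pair clears min_count and threshold.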

def make_bigrams(texts, bigram_mod):
    """Apply the trained bigram model to each tokenized document."""
    return [bigram_mod[doc] for doc in texts]


def remove_stopwords(texts, stop_words):
    """Drop stop words from each tokenized document."""
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]


def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each document, keeping only the allowed parts of speech.

    See https://spacy.io/api/annotation for the POS tag set.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def process_text(texts, stop_words, bigram_mod, nlp):
    """Run the full preprocessing pipeline: stop words -> bigrams -> lemmas."""
    # Remove stop words
    data_words_nostops = remove_stopwords(texts, stop_words)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
    # Lemmatize, keeping only nouns, adjectives, verbs and adverbs
    data_lemmatized = lemmatization(data_words_bigrams, nlp,
                                    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized


def create_lda_model(corpus, id2word, num_topics=6):
    """Train a gensim LDA model on the bag-of-words corpus."""
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
    return lda_model
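
# Illustrative usage (variable names are hypothetical):
#   lda = create_lda_model(corpus, id2word, num_topics=6)
#   lda.print_topics()  # each topic is a weighted list of words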

def print_topics(lda_model):
    """Return the topics as (topic_id, word-weight string) pairs."""
    topics = lda_model.print_topics()
    return topics


def visualize_lda_model(lda_model, corpus, id2word):
    """Build the interactive pyLDAvis visualization for the trained model."""
    p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return p


def topic_modeling(reviews):
    """End-to-end pipeline: raw reviews -> topics and an HTML visualization."""
    # Split the input string into individual reviews (one per line)
    responses = reviews.split("\n")
    # Preprocess responses (lowercase, strip punctuation)
    preprocessed_responses = preprocess_responses(responses)
    # Tokenize responses into words
    data_words = list(sent_to_words(preprocessed_responses))
    # Build the stop word list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)
    # Initialize the spaCy English model, keeping only the tagger for efficiency
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    # Process text (remove stop words, form bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)
    # Create the dictionary and bag-of-words corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]
    # Train the LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=6)
    # Collect the topics
    topics = print_topics(lda_model)
    # Build the visualization and render it to standalone HTML
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)
    # Save the HTML next to this script
    current_dir = os.path.dirname(__file__)  # Directory containing this script
    filename = "lda_visualization.html"
    filepath = os.path.join(current_dir, filename)
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)
    print("Successfully saved", filename)
    return topics, visualization_html

# Gradio interface
iface = gr.Interface(
    fn=topic_modeling,
    inputs="text",
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization")
    ]
)

iface.launch(share=True)

#rebuild
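
# Illustrative input (hypothetical reviews, pasted one per line into the textbox):
#   The battery life on this phone is excellent and it charges quickly
#   Customer service was slow to respond and the refund took weeks
#   Great screen quality but the speakers are a bit quiet
# The app returns the discovered topics as word-weight lists plus an embedded
# pyLDAvis visualization.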