nibbz2024 committed on
Commit
d94e8a0
1 Parent(s): 8879f07

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -0
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ # nltk.download('stopwords')
3
+ # nltk.download('wordnet')
4
+ from nltk.corpus import stopwords
5
+ from nltk.stem.wordnet import WordNetLemmatizer
6
+ import string
7
+
8
+ import gensim
9
+ from gensim.utils import simple_preprocess
10
+
11
+ import pickle
12
+ import re
13
+ import pyLDAvis
14
+ import pyLDAvis.gensim
15
+
16
+ import matplotlib.pyplot as plt
17
+ import pandas as pd
18
+
19
+ import re
20
+ from pprint import pprint
21
+ import gensim.corpora as corpora
22
+ import spacy
23
+
24
+ import datetime
25
+ import os
26
+
27
+ import gradio as gr
28
+
29
def preprocess_response(response):
    """Return ``response`` lowercased, with the characters ``, . ! ?`` removed.

    Only those four punctuation characters are stripped; every other
    character is kept.
    """
    return re.sub(r'[,\.!?]', '', response).lower()
35
+
36
def preprocess_responses(responses):
    """Run ``preprocess_response`` on each item and return the results as a list."""
    cleaned = []
    for item in responses:
        cleaned.append(preprocess_response(item))
    return cleaned
39
+
40
def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Each sentence is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess``.
    """
    for raw in sentences:
        # deacc=True asks gensim to strip accent marks from the tokens.
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens
43
+
44
def make_bigram_trigram_models(data_words):
    """Train gensim phrase detectors and return them as ``(bigram, trigram)``.

    Args:
        data_words: Tokenized documents passed to ``gensim.models.Phrases``.

    Returns:
        A tuple of two ``Phraser`` objects: the first wraps the bigram
        detector, the second the trigram detector.
    """
    # A higher threshold produces fewer phrases.
    bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    # The trigram detector is trained on the bigram-transformed token stream.
    trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

    # Phraser gives a faster way to club a sentence into bigrams/trigrams.
    return (
        gensim.models.phrases.Phraser(bigram_phrases),
        gensim.models.phrases.Phraser(trigram_phrases),
    )
54
+
55
def make_bigrams(texts, bigram_mod):
    """Index ``bigram_mod`` with every document and collect the results in a list."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged
57
+
58
def remove_stopwords(texts, stop_words):
    """Tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()``
            and then ``simple_preprocess``, so a list of tokens is
            re-tokenized from its string form rather than used as-is.
        stop_words: Iterable of tokens to discard.

    Returns:
        A list holding one list of kept tokens per document.
    """
    # Build the lookup once: set membership is O(1), whereas scanning a list
    # for every single token costs O(len(stop_words)).
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]
60
+
61
def lemmatization(texts, nlp, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed part-of-speech tag.

    See https://spacy.io/api/annotation for the tag scheme.

    Args:
        texts: Iterable of token lists; each is joined with single spaces
            before being handed to ``nlp``.
        nlp: Callable that takes a string and returns an iterable of tokens
            exposing ``lemma_`` and ``pos_`` attributes.
        allowed_postags: Collection of ``pos_`` values to keep. The default
            is an immutable tuple so it cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per input document.
    """
    # Resolve the tag collection once so each membership test is a set lookup.
    allowed = frozenset(allowed_postags)
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed]
        for sent in texts
    ]
68
+
69
def process_text(texts, stop_words, bigram_mod, nlp):
    """Remove stop words, merge bigrams, then lemmatize, in that order.

    Returns the output of ``lemmatization`` for the transformed documents.
    """
    without_stops = remove_stopwords(texts, stop_words)
    with_bigrams = make_bigrams(without_stops, bigram_mod)

    # Keep only nouns, adjectives, verbs and adverbs.
    kept_tags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    return lemmatization(with_bigrams, nlp, allowed_postags=kept_tags)
80
+
81
def create_lda_model(corpus, id2word, num_topics=10):
    """Construct and return a ``gensim.models.LdaModel`` for ``corpus``.

    Args:
        corpus: Passed through as the model's ``corpus`` argument.
        id2word: Passed through as the model's ``id2word`` argument.
        num_topics: Number of topics requested from the model (default 10).

    Returns:
        The constructed ``LdaModel``.
    """
    settings = {
        "corpus": corpus,
        "id2word": id2word,
        "num_topics": num_topics,
        "random_state": 100,  # fixed seed value passed to gensim
        "chunksize": 100,
        "passes": 10,
        "per_word_topics": True,
    }
    return gensim.models.LdaModel(**settings)
90
+
91
def print_topics(lda_model):
    """Return the result of ``lda_model.print_topics()``.

    Despite the name, this function itself writes nothing to stdout.
    """
    return lda_model.print_topics()
95
+
96
def visualize_lda_model(lda_model, corpus, id2word):
    """Return the result of ``pyLDAvis.gensim.prepare`` for the model, corpus and dictionary."""
    return pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
99
+
100
def topic_modeling(reviews):
    """Fit an LDA topic model on newline-separated text and render it with pyLDAvis.

    Args:
        reviews: A single string holding the documents, one per line.
            Blank lines are ignored.

    Returns:
        A ``(topics, visualization_html)`` tuple: the value returned by
        ``print_topics`` for the fitted model, and the pyLDAvis
        visualization as an HTML string.

    Raises:
        ValueError: If ``reviews`` has no non-blank lines, or if no tokens
            remain after stop-word removal and lemmatization.

    Side effects:
        Attempts to write the HTML to ``lda_visualization.html`` in the
        directory containing this script and prints the outcome. A failed
        write is reported but does not abort the call.
    """
    # Split the input string into individual reviews, dropping blank lines so
    # they do not turn into empty documents.
    responses = [line.strip() for line in (reviews or "").split("\n")]
    responses = [line for line in responses if line]
    if not responses:
        # Fail early with a clear message instead of deep inside gensim.
        raise ValueError("No non-empty responses were provided.")

    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)

    # Convert responses to words
    data_words = list(sent_to_words(preprocessed_responses))

    # Create stop words list
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    # Create bigram and trigram models; only the bigram model is used below.
    bigram_mod, _ = make_bigram_trigram_models(data_words)

    # Load the spaCy English model with the parser and NER components disabled.
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Process text (remove stopwords, make bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)

    # Documents that lost every token carry no information for the model.
    processed_data = [doc for doc in processed_data if doc]
    if not processed_data:
        raise ValueError("No usable words remain after preprocessing.")

    # Create Dictionary and Corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]

    # Create LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=10)

    # Collect the topics
    topics = print_topics(lda_model)

    # Visualize LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)
    visualization_html = pyLDAvis.prepared_data_to_html(visualization)

    # Save next to this script (its own directory, not necessarily the CWD).
    current_dir = os.path.dirname(os.path.abspath(__file__))
    filename = "lda_visualization.html"
    filepath = os.path.join(current_dir, filename)
    try:
        with open(filepath, "w", encoding="utf-8") as file:
            file.write(visualization_html)
    except OSError as exc:
        # The saved copy is a convenience; the HTML is still returned.
        print("Could not save", filename, "-", exc)
    else:
        print("Successfully saved in", filename)

    return topics, visualization_html
147
+
148
# Gradio UI: one free-text input; outputs are the topic list and the
# pyLDAvis HTML returned by topic_modeling.
iface = gr.Interface(
    topic_modeling,
    "text",
    [gr.Textbox(label="Topics"), gr.HTML(label="Visualization")],
)

# share=True asks Gradio to also create a public share link.
iface.launch(share=True)
160
+