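"""Gradio app for LDA topic modeling.

Takes newline-separated reviews, preprocesses them (lowercasing, stopword
removal, bigram detection, spaCy lemmatization), fits a gensim LDA model,
and returns the discovered topics alongside an interactive pyLDAvis
visualization.
"""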
import os
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import pyLDAvis
import pyLDAvis.gensim  # renamed to pyLDAvis.gensim_models in pyLDAvis >= 3.0

import spacy
import spacy.cli

import gradio as gr

# Download the small English pipeline once so spacy.load() below succeeds.
spacy.cli.download("en_core_web_sm")

def preprocess_response(response):
    # Remove punctuation
    response = re.sub(r'[,\.!?]', '', response)
    # Convert to lowercase
    response = response.lower()
    return response

def preprocess_responses(responses):
    preprocessed_responses = [preprocess_response(response) for response in responses]
    return preprocessed_responses

def sent_to_words(sentences):
    for sentence in sentences:
        # simple_preprocess tokenizes and lowercases; deacc=True also strips accents
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def make_bigram_trigram_models(data_words):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # a higher threshold yields fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    return bigram_mod, trigram_mod
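
# Illustrative example (hypothetical tokens): with enough co-occurrences in the
# training data, bigram_mod[["new", "york", "pizza"]] would yield
# ["new_york", "pizza"]; the actual phrases depend on min_count and threshold.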

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def process_text(texts, stop_words, bigram_mod, nlp):
    # Remove Stop Words
    data_words_nostops = remove_stopwords(texts, stop_words)

    # Form Bigrams (only the bigram model is applied; trigram_mod goes unused)
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    return data_lemmatized

def create_lda_model(corpus, id2word, num_topics=6):
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,      # fixed seed for reproducible topics
                                       chunksize=100,         # documents per training chunk
                                       passes=10,             # full passes over the corpus
                                       per_word_topics=True)
    return lda_model

def print_topics(lda_model):
    # Return the topics as (topic_id, keyword-weight string) pairs
    return lda_model.print_topics()

def visualize_lda_model(lda_model, corpus, id2word):
    p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return p

def topic_modeling(reviews):
    # Split the input string into individual reviews
    responses = reviews.split("\n")
    
    # Preprocess responses
    preprocessed_responses = preprocess_responses(responses)

    # Convert responses to words
    data_words = list(sent_to_words(preprocessed_responses))

    # Create stop words list, extended with a few extra high-frequency tokens
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

    # Create bigram and trigram models
    bigram_mod, trigram_mod = make_bigram_trigram_models(data_words)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Process text (remove stopwords, make bigrams, lemmatize)
    processed_data = process_text(data_words, stop_words, bigram_mod, nlp)

    # Create Dictionary and Corpus
    id2word = corpora.Dictionary(processed_data)
    corpus = [id2word.doc2bow(text) for text in processed_data]

    # Create LDA model
    lda_model = create_lda_model(corpus, id2word, num_topics=6)

    # Print topics
    topics = print_topics(lda_model)
    
    # Visualize LDA model
    visualization = visualize_lda_model(lda_model, corpus, id2word)

    visualization_html = pyLDAvis.prepared_data_to_html(visualization)

    current_dir = os.path.dirname(__file__)  # Directory containing this script
    filename = "lda_visualization.html"
    filepath = os.path.join(current_dir, filename)
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(visualization_html)  # Save a standalone copy of the visualization
    print("Successfully saved", filepath)

    return topics, visualization_html
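

# A hedged sketch (an assumption, not part of the original app): gensim's
# CoherenceModel can score a fitted LDA model, which helps pick num_topics.
def coherence_score(lda_model, processed_data, id2word):
    from gensim.models import CoherenceModel
    return CoherenceModel(model=lda_model, texts=processed_data,
                          dictionary=id2word, coherence='c_v').get_coherence()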

# Interface
iface = gr.Interface(
    fn=topic_modeling,
    inputs=gr.Textbox(lines=10, label="Reviews (one per line)"),
    outputs=[
        gr.Textbox(label="Topics"),
        gr.HTML(label="Visualization"),
    ],
)

iface.launch(share=True)  # share=True also serves a temporary public link