Harsh502s committed on
Commit
c5d623b
·
1 Parent(s): ade606e
Models/bin.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a4fbec22e1a6e06396e8b1c384d5d541b6c0dfd2cec61a8c9a4f7e1179db0c
3
+ size 756
Models/stackexchange_topic_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81f72e9da968496087c2dbe77cc06e1937789099c7b69380e9cebd5ab0a357f8
3
+ size 438242069
Models/tag_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b304a86faeda3cd1ff63af09e57bec8ba0c98d5bdb30613e7fcb08ee1f57b9c
3
+ size 77937800
Models/token.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72c3d62f1cc157956c2131619934ced13e82052e23fa4efe60f104a6632d2a5c
3
+ size 1961509
Pages/1_📊_Topic Model Results.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from bertopic import BERTopic
3
+
4
+
5
@st.cache_resource
def load_model():
    """Load the serialized BERTopic model shipped with the app.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the loaded object instead of reading the file again on every rerun.

    Returns:
        The object produced by ``BERTopic.load`` for
        ``Models/stackexchange_topic_model.pkl``.
    """
    model_path = r"Models/stackexchange_topic_model.pkl"
    return BERTopic.load(model_path)


# Module-level handle read by the page's rendering function.
bertopic_model = load_model()
11
+
12
+
13
def topic_model_results():
    """Render the "Topic Model Results" page.

    Creates five tabs and writes one BERTopic visualization into each of
    them, using the module-level ``bertopic_model``.
    """
    st.title("Topic Model Results")
    tab_labels = [
        "Topic Word Score",
        "Intertopic Distance Map",
        "Topic Probability Distribution",
        "Visualize Hierarchical Topics",
        "Visualize Topics Heatmap",
    ]
    word_score_tab, distance_tab, distribution_tab, hierarchy_tab, heatmap_tab = (
        st.tabs(tab_labels)
    )

    with word_score_tab:
        barchart = bertopic_model.visualize_barchart(top_n_topics=20)
        st.write(barchart)

    with distance_tab:
        distance_map = bertopic_model.visualize_topics()
        st.write(distance_map)

    with distribution_tab:
        # Only the first entry of ``probabilities_`` is plotted.
        # NOTE(review): presumably this is the topic distribution of the first
        # training document - confirm against how the model was fitted.
        first_entry = bertopic_model.probabilities_[0]
        distribution = bertopic_model.visualize_distribution(
            first_entry, min_probability=0.015
        )
        st.write(distribution)

    with hierarchy_tab:
        hierarchy = bertopic_model.visualize_hierarchy()
        st.write(hierarchy)

    with heatmap_tab:
        heatmap = bertopic_model.visualize_heatmap()
        st.write(heatmap)


if __name__ == "__main__":
    topic_model_results()
Pages/2_🤖_Models.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_extras.tags import tagger_component
3
+ import re
4
+ import pickle
5
+ from keybert import KeyBERT
6
+ from bertopic import BERTopic
7
+ from keras.models import load_model
8
+ from keras.preprocessing.sequence import pad_sequences
9
+
10
+
11
# Load every model artifact used by this page (cached by Streamlit)
@st.cache_resource
def load_models():
    """Load all model artifacts needed by the tagging demos.

    Returns:
        A 5-tuple, in this order:
        the BERTopic model loaded from ``Models/stackexchange_topic_model.pkl``,
        a ``KeyBERT`` instance built with ``"all-MiniLM-L6-v2"``,
        the Keras model loaded from ``Models/tag_model.h5``,
        the object unpickled from ``Models/token.pkl``,
        the object unpickled from ``Models/bin.pkl``.
    """

    def _unpickle(path):
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        # NOTE: pickle executes arbitrary code while loading; only ever point
        # this at artifacts shipped with the app, never at user-supplied files.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    return (
        BERTopic.load(r"Models/stackexchange_topic_model.pkl"),
        KeyBERT("all-MiniLM-L6-v2"),
        load_model(r"Models/tag_model.h5"),
        _unpickle(r"Models/token.pkl"),
        _unpickle(r"Models/bin.pkl"),
    )


# Load the models into memory once; the functions below read these globals.
bertopic_model, keybert_model, cnn_model, tokenizer, binarizer = load_models()
25
+
26
+
27
# Normalise raw input text before it is handed to a model
def clean_text(text):
    """Strip markup-like spans and non-letter characters from ``text``.

    The substitutions are applied in order:
      1. anything between ``<`` and the next ``>`` is removed,
      2. every character that is not an ASCII letter or an apostrophe
         becomes a space,
      3. each run of whitespace collapses to a single space.

    Leading/trailing spaces are not trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r"<.*?>", ""),
        (r"[^A-Za-z']", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
33
+
34
+
35
# Predict tags for the input text with the CNN model
def tag_cnn_model(text):
    """Run the supervised tagging pipeline on ``text``.

    The text is cleaned with ``clean_text``, converted to sequences by the
    module-level ``tokenizer``, padded to length 512, scored by ``cnn_model``
    and thresholded at 0.25. The resulting 0/1 matrix is mapped back through
    ``binarizer.inverse_transform``.

    Args:
        text: Raw input text.

    Returns:
        Whatever ``binarizer.inverse_transform`` yields for the single input
        row; callers in this file take element ``[0]`` of it.
    """
    sequences = tokenizer.texts_to_sequences([clean_text(text)])
    padded = pad_sequences(sequences, maxlen=512)
    scores = cnn_model.predict(padded)
    # A label is kept when its score reaches the 0.25 cut-off.
    indicator = (scores >= 0.25).astype(int)
    return binarizer.inverse_transform(indicator)
43
+
44
+
45
# Extract keyphrases from the input text with the KeyBERT model
def retrieve_keyphrases(text, n, ngram_range):
    """Ask KeyBERT for keyphrases and return them ordered by score.

    Args:
        text: Document to extract keyphrases from.
        n: Passed to KeyBERT as ``top_n``.
        ngram_range: Passed to KeyBERT as ``keyphrase_ngram_range``.

    Returns:
        The entries returned by ``keybert_model.extract_keywords`` as a list,
        sorted by their second element in descending order.
    """
    # Fixed seed vocabulary handed to KeyBERT on every call.
    seed_terms = [
        "machine-learning",
        "r",
        "regression",
        "deep-learning",
        "neural-networks",
        "data-request",
        "python",
        "reinforcement-learning",
        "classification",
        "time-series",
        "probability",
        "neural-network",
        "distributions",
        "bayesian",
        "hypothesis-testing",
        "keras",
        "mathematical-statistics",
        "scikit-learn",
        "logistic",
        "convolutional-neural-networks",
        "clustering",
        "tensorflow",
        "terminology",
        "nlp",
        "correlation",
        "self-study",
        "normal-distribution",
        "geospatial",
        "cross-validation",
        "optimization",
        "random-forest",
        "mixed-model",
        "data-mining",
        "feature-selection",
        "pca",
        "references",
        "computer-vision",
        "data-visualization",
        "confidence-interval",
        "generalized-linear-model",
        "variance",
        "natural-language-processing",
        "dataset",
        "svm",
        "training",
        "maximum-likelihood",
        "statistical-significance",
        "gradient-descent",
        "multiple-regression",
        "estimation",
    ]
    # NOTE(review): both use_maxsum and use_mmr are switched on; confirm
    # against the KeyBERT documentation which strategy actually applies.
    extracted = keybert_model.extract_keywords(
        text,
        keyphrase_ngram_range=ngram_range,
        top_n=n,
        diversity=0.5,
        use_maxsum=True,
        use_mmr=True,
        seed_keywords=seed_terms,
    )
    ranked = list(extracted)
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
108
+
109
+
110
# Find the most similar topics for the input text using the BERTopic model
def output_unsupervised(text, n):
    """Display the words of the topics most similar to ``text``.

    Queries the module-level ``bertopic_model`` for up to ``n`` similar
    topics and renders one tag row per topic with ``tagger_component``.
    Topics are shown in ascending topic-id order, as before.

    Args:
        text: Query text passed to ``bertopic_model.find_topics``.
        n: Requested number of topics (``top_n``); also the maximum number
            of rows rendered.
    """
    similar_topics, _similarity = bertopic_model.find_topics(text, top_n=n)
    # Iterate over what was actually returned rather than ``range(n)``: the
    # old index loop raised IndexError whenever fewer than ``n`` topics came
    # back. The slice keeps the "at most n rows" behaviour of the original.
    topic_ids = sorted(similar_topics)[: max(n, 0)]
    for position, topic_id in enumerate(topic_ids, start=1):
        words = [entry[0] for entry in bertopic_model.get_topic(topic_id)]
        tagger_component(f"Tags from cluster {position}:", words, color_name="red")
119
+
120
+
121
# Render the supervised (CNN) demo
def supervised_page():
    """Show a text box and, on button press, the tags predicted by the CNN.

    The entered text is cleaned with ``clean_text`` and passed to
    ``tag_cnn_model``; the first element of its result is rendered with
    ``tagger_component``.
    """
    st.header("Supervised Model")
    raw_text = st.text_area(
        "Enter text to assign tags", height=200, key="supervised_text"
    )
    cleaned = clean_text(raw_text)
    clicked = st.button("Assign tags", key="supervised_button")
    if not clicked:
        return
    predicted = tag_cnn_model(cleaned)
    tagger_component("Tags:", predicted[0], color_name="red")
129
+
130
+
131
# Display the unsupervised model using bertopic page of the app
def unsupervised_page_bertopic():
    """Show the BERTopic demo: text box, tag-count input and a run button.

    On button press the cleaned text and the chosen count are handed to
    ``output_unsupervised``, which renders the result.
    """
    st.header("Unsupervised Model Using BERTopic Model")
    text = st.text_area(
        "Enter text to assign tags", height=200, key="unsupervised_text_bertopic"
    )
    text = clean_text(text)
    # min_value=1 keeps zero/negative counts from reaching the topic search;
    # the input previously had no lower bound.
    n = st.number_input(
        "Enter number of tags to assign",
        value=5,
        min_value=1,
        key="unsupervised_n_bertopic",
    )
    if st.button("Assign tags", key="unsupervised_button_bertopic"):
        output_unsupervised(text, n)
143
+
144
+
145
# Display the unsupervised model using keybert page of the app
def unsupervised_page_keybert():
    """Show the KeyBERT demo: text box, tag count, n-gram bounds, run button.

    On button press the cleaned text is passed to ``retrieve_keyphrases`` and
    the first element of each returned entry is rendered with
    ``tagger_component``. An inverted n-gram range is reported to the user
    instead of being forwarded to KeyBERT.
    """
    st.header("Unsupervised Model Using KeyBERT Model")
    text = st.text_area(
        "Enter text to assign tags", height=200, key="unsupervised_text_keybert"
    )
    text = clean_text(text)
    # min_value=1: the count previously accepted zero and negative values.
    n = st.number_input(
        "Enter number of tags to assign",
        value=10,
        min_value=1,
        key="unsupervised_n_keybert",
    )
    ngram_range_lower = st.number_input(
        "Enter lower limit of ngram range",
        value=1,
        min_value=1,
        max_value=6,
        key="unsupervised_ngram_lower",
    )
    ngram_range_upper = st.number_input(
        "Enter upper limit of ngram range",
        value=3,
        min_value=1,
        max_value=6,
        key="unsupervised_ngram_upper",
    )
    ngram_range = (ngram_range_lower, ngram_range_upper)
    if st.button("Assign tags", key="unsupervised_button_keybert"):
        # The two bounds are independent widgets, so lower > upper is
        # possible; such a range cannot describe any n-gram, so stop here
        # with an explicit message rather than passing it on.
        if ngram_range_lower > ngram_range_upper:
            st.error(
                "The lower limit of the ngram range must not exceed the upper limit."
            )
            return
        topics = retrieve_keyphrases(text, n, ngram_range)
        topics = [topic[0] for topic in topics]
        tagger_component("Tags:", topics, color_name="red")
174
+
175
+
176
# Render the model-selection page
def model_page():
    """Show one tab per tagging approach and render its demo inside it."""
    st.title("Select a model to use:")
    tab_labels = [
        "Supervised Using CNN",
        "Unsupervised-BERTopic",
        "Unsupervised-KeyBERT",
    ]
    # Renderers are listed in the same order as the tab labels above.
    renderers = (
        supervised_page,
        unsupervised_page_bertopic,
        unsupervised_page_keybert,
    )
    for tab, render in zip(st.tabs(tab_labels), renderers):
        with tab:
            render()


if __name__ == "__main__":
    model_page()
Pages/3_👋_About.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
# Render the about page: authors, code location, data source and models used
def about_page():
    """Write the static "About" content to the page."""
    st.header("About")

    # NOTE(review): the "code" link below has an empty target (``[here]( )``);
    # the repository URL still needs to be filled in.
    intro_paragraphs = (
        "This app was created by [Harshit Singh](https://harsh502s.github.io), Poorvi Singh and Samruddhi Raskar as a part of their MSc Data Science 3rd semester project.",
        "The code for this app can be found [here]( ).",
        "The data on which these models are trained can be found [here](https://www.kaggle.com/datasets/harsh502s/stackexchange-tag-dataset).",
    )
    for paragraph in intro_paragraphs:
        st.write(paragraph)

    st.subheader("Models used in this app are:")
    model_links = (
        "1. [BERTopic](https://maartengr.github.io/BERTopic/api/bertopic.html#:~:text=BERTopic%20is%20a%20topic%20modeling,words%20in%20the%20topic%20descriptions.)",
        "2. [KeyBERT](https://maartengr.github.io/KeyBERT/#:~:text=KeyBERT%20is%20a%20minimal%20and,most%20similar%20to%20a%20document.)",
        "3. [CNN](https://www.tensorflow.org/tutorials/text/text_classification_rnn)",
    )
    for entry in model_links:
        st.write(entry)


if __name__ == "__main__":
    about_page()
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from st_pages import Page, show_pages
3
+
4
# Page configuration goes first: Streamlit documents st.set_page_config as
# something that must run before any other Streamlit command in the script.
# NOTE(review): the icon below was mis-encoded in this listing and has been
# restored as the memo emoji - confirm against the repository.
st.set_page_config(
    page_title="Autonomous Text Tagging App",
    page_icon="📝",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Declare the sidebar pages (script path, display name, icon). The emoji in
# the paths are part of the real file names under Pages/.
show_pages(
    [
        Page(r"app.py", "Home", "🏠"),
        Page(r"Pages/1_📊_Topic Model Results.py", "Topic Model Result", "📊"),
        Page(r"Pages/2_🤖_Models.py", "Models", "🤖"),
        Page(r"Pages/3_👋_About.py", "About", "👋"),
    ]
)
20
+
21
+
22
# Display the main page of the app with instructions on how to use it
def main():
    """Render the landing page: title, short description and usage notes."""
    st.title("Autonomous Text Tagging App")
    # User-facing typos fixed: "approches" -> "approaches", "give" -> "gives".
    st.subheader(
        "This application shows a demo of different supervised and unsupervised approaches taken in the field of NLP to give relevant tags to the text."
    )
    st.subheader("This is a multi-page app.")
    st.write("1. You can navigate between pages by clicking on the sidebar.")
    st.write("2. The Topic Modeling Results page shows the results of BERTopic.")
    st.write("3. The Model page gives a demo of all the models used in this app.")
    st.write("4. The About page gives information about the creator, code, and data.")
    st.divider()


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.0.0
2
+ altair==5.1.2
3
+ asttokens==2.4.0
4
+ astunparse==1.6.3
5
+ attrs==23.1.0
6
+ backcall==0.2.0
7
+ beautifulsoup4==4.12.2
8
+ bertopic==0.15.0
9
+ blinker==1.6.3
10
+ cachetools==5.3.1
11
+ certifi==2023.7.22
12
+ charset-normalizer==3.3.0
13
+ click==8.1.7
14
+ colorama==0.4.6
15
+ comm==0.1.4
16
+ contourpy==1.1.1
17
+ cycler==0.12.1
18
+ Cython==0.29.36
19
+ debugpy==1.8.0
20
+ decorator==5.1.1
21
+ entrypoints==0.4
22
+ exceptiongroup==1.1.3
23
+ executing==2.0.0
24
+ Faker==19.10.0
25
+ fastjsonschema==2.18.1
26
+ favicon==0.7.0
27
+ filelock==3.12.4
28
+ flatbuffers==23.5.26
29
+ fonttools==4.43.1
30
+ fsspec==2023.9.2
31
+ gast==0.5.4
32
+ gitdb==4.0.10
33
+ GitPython==3.1.37
34
+ google-auth==2.23.3
35
+ google-auth-oauthlib==1.0.0
36
+ google-pasta==0.2.0
37
+ grpcio==1.59.0
38
+ h5py==3.10.0
39
+ hdbscan==0.8.33
40
+ htbuilder==0.6.2
41
+ huggingface-hub==0.17.3
42
+ idna==3.4
43
+ importlib-metadata==6.8.0
44
+ ipykernel==6.25.2
45
+ ipython==8.16.1
46
+ jedi==0.19.1
47
+ Jinja2==3.1.2
48
+ joblib==1.3.2
49
+ jsonschema==4.19.1
50
+ jsonschema-specifications==2023.7.1
51
+ jupyter_client==8.3.1
52
+ jupyter_core==5.3.2
53
+ keras==2.14.0
54
+ keybert==0.8.3
55
+ kiwisolver==1.4.5
56
+ libclang==16.0.6
57
+ llvmlite==0.41.0
58
+ lxml==4.9.3
59
+ Markdown==3.5
60
+ markdown-it-py==3.0.0
61
+ markdownlit==0.0.7
62
+ MarkupSafe==2.1.3
63
+ matplotlib==3.8.0
64
+ matplotlib-inline==0.1.6
65
+ mdurl==0.1.2
66
+ ml-dtypes==0.2.0
67
+ more-itertools==10.1.0
68
+ mpmath==1.3.0
69
+ nbformat==5.9.2
70
+ nest-asyncio==1.5.8
71
+ networkx==3.1
72
+ nltk==3.8.1
73
+ numba==0.58.0
74
+ numpy==1.25.2
75
+ oauthlib==3.2.2
76
+ opt-einsum==3.3.0
77
+ packaging==23.2
78
+ pandas==2.1.1
79
+ parso==0.8.3
80
+ pickleshare==0.7.5
81
+ Pillow==10.0.1
82
+ platformdirs==3.11.0
83
+ plotly==5.17.0
84
+ prompt-toolkit==3.0.39
85
+ protobuf==4.24.4
86
+ psutil==5.9.5
87
+ pure-eval==0.2.2
88
+ pyarrow==13.0.0
89
+ pyasn1==0.5.0
90
+ pyasn1-modules==0.3.0
91
+ pydeck==0.8.1b0
92
+ Pygments==2.16.1
93
+ pymdown-extensions==10.3
94
+ pynndescent==0.5.10
95
+ pyparsing==3.1.1
96
+ python-dateutil==2.8.2
97
+ pytz==2023.3.post1
98
+ PyYAML==6.0.1
99
+ pyzmq==25.1.1
100
+ referencing==0.30.2
101
+ regex==2023.10.3
102
+ requests==2.31.0
103
+ requests-oauthlib==1.3.1
104
+ rich==13.6.0
105
+ rpds-py==0.10.4
106
+ rsa==4.9
107
+ safetensors==0.4.0
108
+ scikit-learn==1.2.2
109
+ scipy==1.11.3
110
+ seaborn==0.13.0
111
+ sentence-transformers==2.2.2
112
+ sentencepiece==0.1.99
113
+ six==1.16.0
114
+ smmap==5.0.1
115
+ soupsieve==2.5
116
+ st-annotated-text==4.0.1
117
+ st-pages==0.4.5
118
+ stack-data==0.6.3
119
+ streamlit==1.27.2
120
+ streamlit-camera-input-live==0.2.0
121
+ streamlit-card==0.0.61
122
+ streamlit-embedcode==0.1.2
123
+ streamlit-extras==0.3.4
124
+ streamlit-faker==0.0.2
125
+ streamlit-image-coordinates==0.1.6
126
+ streamlit-keyup==0.2.0
127
+ streamlit-tags==1.2.8
128
+ streamlit-toggle-switch==1.0.2
129
+ streamlit-vertical-slider==1.0.2
130
+ sympy==1.12
131
+ tenacity==8.2.3
132
+ tensorboard==2.14.1
133
+ tensorboard-data-server==0.7.1
134
+ tensorflow==2.14.0
135
+ tensorflow-estimator==2.14.0
136
+ tensorflow-io-gcs-filesystem==0.31.0
137
+ termcolor==2.3.0
138
+ threadpoolctl==3.2.0
139
+ tokenizers==0.14.1
140
+ toml==0.10.2
141
+ toolz==0.12.0
142
+ torch==2.1.0
143
+ torchvision==0.16.0
144
+ tornado==6.3.3
145
+ tqdm==4.66.1
146
+ traitlets==5.11.2
147
+ transformers==4.34.0
148
+ typing_extensions==4.8.0
149
+ tzdata==2023.3
150
+ tzlocal==5.1
151
+ umap-learn==0.5.4
152
+ urllib3==2.0.6
153
+ validators==0.22.0
154
+ watchdog==3.0.0
155
+ wcwidth==0.2.8
156
+ Werkzeug==3.0.0
157
+ wrapt==1.14.1
158
+ zipp==3.17.0