gtech13 committed on
Commit
6716ee9
·
verified ·
1 Parent(s): 88b9f3e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +363 -0
app.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # --- ANTI-CRASH ENVIRONMENT VARIABLES ---
3
+ os.environ["OMP_NUM_THREADS"] = "1"
4
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
5
+
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import warnings
9
+ import numpy as np
10
+ import matplotlib.pyplot as plt
11
+ import seaborn as sns
12
+
13
+ from bertopic import BERTopic
14
+ from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
15
+ from sentence_transformers import SentenceTransformer, models
16
+ from sklearn.feature_extraction.text import CountVectorizer
17
+ from sklearn.decomposition import PCA
18
+ from sklearn.cluster import KMeans
19
+ from sklearn.metrics import silhouette_score
20
+ from umap import UMAP
21
+ from hdbscan import HDBSCAN
22
+ import gensim.corpora as corpora
23
+ from gensim.models.coherencemodel import CoherenceModel
24
+
25
+ warnings.filterwarnings("ignore")
26
+
27
# ==========================================
# 1. PAGE CONFIGURATION & MAPPINGS
# ==========================================
st.set_page_config(
    page_title="Topic Modeling Pipeline",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Human-readable label shown in the UI -> model identifier passed to the
# embedder loader.
EMBEDDING_MAP = dict(
    [
        ("MiniLM (Fast & Lightweight)", "sentence-transformers/all-MiniLM-L6-v2"),
        ("MPNet (High Accuracy)", "sentence-transformers/all-mpnet-base-v2"),
        ("Specter2 (Scientific/Academic)", "allenai/specter2_base"),
    ]
)

# Human-readable label shown in the UI -> pooling keyword. The keyword is
# later matched by substring ("mean", "max", "cls") to switch pooling modes on.
POOLING_MAP = dict(
    [
        ("Mean (Smooth context)", "mean"),
        ("Max (Sharp keywords)", "max"),
        ("CLS (Classification)", "cls"),
        ("Mean-Max (Combined)", "mean-max"),
    ]
)
44
+
45
# --- CACHE THE NEURAL NETWORK ---
@st.cache_resource
def load_embedder(model_name, pool_strat):
    """Assemble a SentenceTransformer from a transformer backbone and a pooling head.

    Args:
        model_name: Model identifier handed to ``models.Transformer``.
        pool_strat: Pooling keyword. Each pooling mode is enabled when its
            name ("mean", "max" or "cls") occurs as a substring, so
            "mean-max" switches on both mean and max pooling.

    Returns:
        A ``SentenceTransformer`` built from the two modules. The function is
        wrapped in ``st.cache_resource`` so Streamlit can reuse the result.
    """
    backbone = models.Transformer(model_name)
    # Keyword looked for in pool_strat -> suffix of the matching Pooling flag.
    mode_flags = {
        f"pooling_mode_{suffix}": keyword in pool_strat
        for keyword, suffix in (
            ("mean", "mean_tokens"),
            ("max", "max_tokens"),
            ("cls", "cls_token"),
        )
    }
    pooling_head = models.Pooling(
        backbone.get_word_embedding_dimension(),
        **mode_flags,
    )
    return SentenceTransformer(modules=[backbone, pooling_head])
56
+
57
# ==========================================
# 2. THE GUIDED UI (MAIN PAGE)
# ==========================================
st.title("🧠 BERTopic Topic Modeling Pipeline")

# The banner image is optional. Test for the file explicitly rather than
# catching FileNotFoundError, so a missing image is skipped no matter which
# exception type st.image would raise for an unreadable path.
if os.path.isfile("pipeline.png"):
    st.image("pipeline.png", use_container_width=True)

st.divider()

# --- STEP 0: DATA SETTINGS ---
st.header("📥 Step 0: Input Data & Core Settings")
data_source = st.radio("Choose Data Source:", ["Use Sample ACM Abstract", "Paste Text"], horizontal=True)

sample_abstract = """
Students who registered for the Mapping with Google massive open online course (MOOC)
were asked several questions during the registration process to identify prior
experience with eleven skills as well as their goals for registering for the course.
At the end of the course, we compared students' self reports of goal achievement
with behavioral click-stream analysis. In addition, we assessed how well prior
skill in a subject predicts a student's course completion and found no correlation.
Our research shows that students who completed course activities were more likely
to earn certificates of completion than peers who did not.
"""

# Pre-fill the text box only when the sample source is selected.
raw_data = st.text_area(
    "Text Data:",
    value=sample_abstract if data_source == "Use Sample ACM Abstract" else "",
    height=150,
)

col_a, col_b = st.columns(2)
with col_a:
    n_themes = st.slider("Target Number of Themes", 2, 20, 3)
with col_b:
    words_per_theme = st.slider("Words to Output per Theme", 3, 10, 5)

# --- THE VERTICAL CONFIGURATION WIZARD ---
st.header("⚙️ Model Configuration")

with st.expander("1️⃣ Semantic Layer (Embeddings & Pooling)", expanded=True):
    ui_embedding = st.selectbox("Embedding Model", list(EMBEDDING_MAP.keys()))
    ui_pooling = st.selectbox("Pooling Strategy", list(POOLING_MAP.keys()))

with st.expander("2️⃣ Geometry Layer (Dimensionality Reduction)", expanded=True):
    ui_algo = st.selectbox("Algorithm", ["UMAP (Complex geometry)", "PCA (Fast/Deterministic)"])
    if "UMAP" in ui_algo:
        ui_metric = st.selectbox("Distance Metric", ["cosine", "euclidean", "manhattan"])
    else:
        # The metric is only offered for UMAP; keep a defined value for PCA.
        ui_metric = "euclidean"
        st.info("PCA inherently uses Variance (Euclidean math), so distance metrics are bypassed.")

with st.expander("3️⃣ Clustering Layer (Grouping)", expanded=True):
    # Raw string: the LaTeX "\ge" would otherwise be an invalid escape sequence.
    st.markdown(r"""
    *Clustering mathematically draws boundaries around similar sentences.*
    * **Primary Engine (HDBSCAN):** Runs on datasets $\ge$ 15 sentences. Automatically filters outliers and finds dense semantic clouds.
    *(Defaults: min_cluster_size=10, cluster_selection_method='eom', metric='euclidean')*
    * **Fallback Engine (K-Means):** Runs on datasets $<$ 15 sentences. Forces all sentences into buckets to prevent math crashes on tiny text samples.
    *(Defaults: n_clusters = Target Themes, random_state=42)*
    """)

with st.expander("4️⃣ Vocabulary Layer (Vectorization)", expanded=True):
    ngram_range = st.slider("N-Gram Range", 1, 3, (1, 2), help="1=Unigrams, 2=Bigrams (e.g., 'machine learning')")
    # The checkbox drives the vectorizer's max_df cut-off; the caption below
    # explains the effect in plain language.
    auto_noise = st.checkbox(
        "Auto-Remove Redundant Noise (max_df)",
        value=True,
        help="Mathematically deletes words that appear in more than 85% of the documents.",
    )
    st.caption("Deletes overly common words (like 'paper' or 'study') that appear everywhere, preventing generic filler from dominating your themes.")

with st.expander("5️⃣ Extraction Layer (Representation)", expanded=True):
    ui_extraction = st.selectbox("Strategy", ["c-TF-IDF (Word frequency)", "KeyBERTInspired (Semantic cosine)", "MMR (Reduce redundancy)"])
    if "MMR" in ui_extraction:
        mmr_diversity = st.slider("MMR Diversity Penalty", 0.0, 1.0, 0.3)
    else:
        # Only MMR uses a diversity penalty.
        mmr_diversity = None

# --- EVALUATION METRICS ---
st.header("📊 Evaluation Metrics")
eval_metrics = st.multiselect(
    "Select KPIs to generate a final report card:",
    ["Topic Diversity", "NPMI Coherence", "Silhouette Score"],
    default=["Topic Diversity", "NPMI Coherence", "Silhouette Score"],
)

st.divider()
142
+
143
# ==========================================
# 3. ENGINE EXECUTION
# ==========================================
if st.button("🚀 Run Topic Modeling Pipeline", type="primary", use_container_width=True):

    if not raw_data or len(raw_data) < 20:
        st.error("Please provide more text data!")
        st.stop()

    # --- MATH EXECUTION (Inside Spinner) ---
    with st.spinner("Processing Semantic Pipeline... (Models are cached to prevent crashes)"):

        # Naive sentence split on '.'; fragments of 10 characters or fewer are dropped.
        sentences = [s.strip() for s in raw_data.split('.') if len(s.strip()) > 10]
        dataset_size = len(sentences)

        # Fewer than two usable sentences cannot be reduced to 2 components
        # or clustered, so stop with a message instead of a traceback.
        if dataset_size < 2:
            st.error("Please provide more text data!")
            st.stop()

        academic_noise = ['students', 'course', 'research', 'paper', 'found', 'likely', 'did']
        from sklearn.feature_extraction import text
        stop_w = list(text.ENGLISH_STOP_WORDS.union(academic_noise))

        # max_df pruning is only applied when enabled and there are more than 10 sentences.
        vectorizer_model = CountVectorizer(
            stop_words=stop_w,
            ngram_range=ngram_range,
            max_df=0.85 if auto_noise and dataset_size > 10 else 1.0,
        )

        custom_embedder = load_embedder(EMBEDDING_MAP[ui_embedding], POOLING_MAP[ui_pooling])
        embeddings = custom_embedder.encode(sentences)

        # Fallback Logic (Step 3 representation in code)
        is_fallback = dataset_size < 15 or "PCA" in ui_algo
        if is_fallback:
            # Never ask K-Means for more clusters than there are sentences.
            safe_n_themes = min(n_themes, dataset_size)
            dim_model = PCA(n_components=2, random_state=42)
            cluster_model = KMeans(n_clusters=safe_n_themes, random_state=42)
            reduce_topics = None
            algo_used = "PCA"
            cluster_algo = "K-Means"
        else:
            dim_model = UMAP(n_neighbors=15, n_components=5, metric=ui_metric, random_state=42)
            # Must be bound to `cluster_model`: that is the name handed to
            # BERTopic below (a different name here raised NameError).
            cluster_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
            reduce_topics = n_themes
            algo_used = "UMAP"
            cluster_algo = "HDBSCAN"

        # Representation
        if "MMR" in ui_extraction:
            rep_model = MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=words_per_theme)
        elif "KeyBERT" in ui_extraction:
            rep_model = KeyBERTInspired(top_n_words=words_per_theme)
        else:
            rep_model = None

        topic_model = BERTopic(
            embedding_model=custom_embedder,
            umap_model=dim_model,
            hdbscan_model=cluster_model,
            vectorizer_model=vectorizer_model,
            representation_model=rep_model,
            nr_topics=reduce_topics,
            top_n_words=words_per_theme,
            language="english",
        )
        topics, _ = topic_model.fit_transform(sentences)

    # ==========================================
    # 4. UI DISPLAY & METRICS (Outside Spinner)
    # ==========================================
    st.success("Analysis Complete!")

    if is_fallback:
        if safe_n_themes < n_themes:
            st.warning(f"⚠️ **Reduced requested themes from {n_themes} to {safe_n_themes}.**\n\n"
                       f"*The Math Explanation:* BERTopic clusters complete sentences to preserve context. "
                       f"You cannot sort {dataset_size} sentences into {n_themes} buckets without leaving empty buckets, "
                       f"which mathematically breaks the clustering algorithms!")
        elif dataset_size < 15:
            # Only claim an automatic switch when the size rule triggered it,
            # not when the user picked PCA on a large dataset.
            st.info(f"ℹ️ Auto-switched to PCA/K-Means due to small dataset size ({dataset_size} sentences).")

    st.markdown("### 🏆 Discovered Themes")
    topic_info = topic_model.get_topic_info()
    all_words = []

    cols = st.columns(3)
    # Topic -1 is skipped; the remaining themes are laid out across three columns.
    visible_ids = [t_id for t_id in topic_info['Topic'] if t_id != -1]
    for col_idx, t_id in enumerate(visible_ids):
        theme_w = [w[0] for w in topic_model.get_topic(t_id)]
        all_words.append(theme_w)
        with cols[col_idx % 3]:
            st.info(f"**Theme {t_id + 1}**\n\n" + "\n".join([f"🔹 {w}" for w in theme_w]))

    # --- METRICS CALCULATIONS ---
    # None means "not computed", so a genuine 0.0 is still reported in the chart.
    div_val, npmi_val, sil_val = None, None, None

    if eval_metrics:
        st.markdown("### 📊 Key Performance Indicators (KPI)")

        with st.spinner("Calculating mathematical metrics... (NPMI requires building a dictionary and takes a moment)"):

            for metric in eval_metrics:
                if "Diversity" in metric:
                    if all_words:
                        # Share of unique words among all words listed across themes.
                        u_words = {w for t in all_words for w in t}
                        t_words = sum(len(t) for t in all_words)
                        div_val = len(u_words) / t_words if t_words > 0 else 0
                        st.metric("Topic Diversity (Target: 1.0)", f"{div_val:.2f}")
                    else:
                        st.metric("Topic Diversity", "Skipped")

                elif "NPMI" in metric:
                    # Best effort: any failure while scoring is reported as
                    # "Skipped" rather than aborting the whole report.
                    try:
                        analyzer = vectorizer_model.build_analyzer()
                        tokenized = [analyzer(s) for s in sentences]
                        dictionary = corpora.Dictionary(tokenized)
                        cm = CoherenceModel(topics=all_words, texts=tokenized, dictionary=dictionary, coherence='c_npmi')
                        temp_npmi = cm.get_coherence()
                        if np.isnan(temp_npmi):
                            st.metric("NPMI Coherence", "N/A (Too few words)")
                        else:
                            npmi_val = float(temp_npmi)
                            st.metric("NPMI Coherence (Target: >0.1)", f"{npmi_val:.2f}")
                    except Exception:
                        st.metric("NPMI Coherence", "Skipped (Data too small)")

                elif "Silhouette" in metric:
                    # Sentences assigned to topic -1 are left out of the score.
                    valid_idx = [i for i, t in enumerate(topics) if t != -1]
                    valid_topics = [topics[i] for i in valid_idx]
                    unique_topics = set(valid_topics)

                    if 1 < len(unique_topics) < len(valid_idx):
                        sil_val = float(silhouette_score(
                            embeddings[valid_idx],
                            valid_topics,
                            metric='cosine',
                        ))
                        st.metric("Silhouette Score (Target: >0.0)", f"{sil_val:.2f}")
                    else:
                        st.metric("Silhouette Score", "Skipped (Themes need ≥2 sentences each)")

    # ==========================================
    # 5. XAI VISUALIZATION GRAPH
    # ==========================================
    st.markdown("### 📈 Explainable AI (XAI) Architecture Map")

    with st.spinner("Rendering Explainable AI Dashboard..."):
        sns.set_theme(style="whitegrid")
        fig = plt.figure(figsize=(16, 14))
        fig.suptitle(f"Topic Modeling Pipeline Analytics\n(Pooling: {ui_pooling.split()[0]} | Rep: {ui_extraction.split()[0]})", fontsize=20, fontweight='bold', y=0.98)
        box_style = dict(boxstyle="round,pad=0.4", facecolor='lightyellow', edgecolor='orange', alpha=0.9)

        # Actual width of the sentence vectors; it varies with the chosen
        # model and pooling, so it is read from the data instead of hard-coded.
        embed_dim = embeddings.shape[1]

        # 1. Embeddings
        ax1 = plt.subplot(3, 2, 1)
        sns.heatmap(embeddings[:, :50], cmap="viridis", cbar=False, ax=ax1)
        ax1.set_title("STEP 1: Embeddings & Pooling", fontsize=13, fontweight='bold')
        ax1.set_ylabel("Sentences")
        ax1.set_xlabel("Vector Dimensions (First 50 shown)")
        ax1.text(0.5, -0.25, f"Math: {ui_embedding.split()[0]} encodes text into {embed_dim}D.\nPooling '{ui_pooling.split()[0]}' squashes word vectors into 1 sentence vector.",
                 fontsize=10, ha='center', va='top', transform=ax1.transAxes, bbox=box_style)

        # 2. Geometry
        ax2 = plt.subplot(3, 2, 2)
        # Only the first two reduced axes are plotted.
        reduced_embeddings = topic_model.umap_model.transform(embeddings)
        ax2.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c='grey', s=100, alpha=0.6, edgecolor='k')
        ax2.set_title(f"STEP 2: Geometry ({algo_used})", fontsize=13, fontweight='bold')
        ax2.text(0.5, -0.25, f"Math: {algo_used} reduces {embed_dim}D vectors into a 2D map.\nPlaces similar sentences close together.",
                 fontsize=10, ha='center', va='top', transform=ax2.transAxes, bbox=box_style)

        # 3. Clustering
        ax3 = plt.subplot(3, 2, 3)
        ax3.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=topics, cmap='tab10', s=150, edgecolor='k')
        ax3.set_title(f"STEP 3: Clustering ({cluster_algo})", fontsize=13, fontweight='bold')
        ax3.text(0.5, -0.25, f"Math: {cluster_algo} scans the 2D space to draw boundaries.\nColors represent assigned semantic clusters.",
                 fontsize=10, ha='center', va='top', transform=ax3.transAxes, bbox=box_style)

        # 4. Representation
        ax4 = plt.subplot(3, 2, 4)
        theme_1_data = topic_model.get_topic(0)
        if theme_1_data:
            # Reversed so the first-listed word ends up at the top of the bar chart.
            words = [x[0] for x in theme_1_data][::-1]
            scores = [x[1] for x in theme_1_data][::-1]
            ax4.barh(words, scores, color='coral', edgecolor='black')
            ax4.set_title(f"STEP 4: Topic Representation ({ui_extraction.split()[0]})", fontsize=13, fontweight='bold')
            ax4.text(0.5, -0.25, f"Math: Applies {ui_extraction.split()[0]} to rank vocabulary.\nLonger bars = higher semantic relevance.",
                     fontsize=10, ha='center', va='top', transform=ax4.transAxes, bbox=box_style)
        else:
            ax4.text(0.5, 0.5, "Theme not found", ha='center', transform=ax4.transAxes)

        # 5. KPI Dashboard
        ax5 = plt.subplot(3, 2, 5)
        ax5.axis('off')
        ax5.set_title("STEP 5: Key Performance Indicators (KPI)", fontsize=13, fontweight='bold', y=0.9)

        div_str = f"{div_val:.2f}" if div_val is not None else "Skipped"
        npmi_str = f"{npmi_val:.2f}" if npmi_val is not None else "Skipped"
        sil_str = f"{sil_val:.2f}" if sil_val is not None else "Skipped"

        kpi_text = (
            f"📊 Topic Diversity: {div_str} (Target: 1.0)\n\n"
            f"🧠 NPMI Coherence: {npmi_str} (Target: >0.1)\n\n"
            f"📏 Silhouette Score: {sil_str} (Target: >0.0)"
        )
        ax5.text(0.5, 0.4, kpi_text, fontsize=12, va='center', ha='center',
                 bbox=dict(boxstyle="square,pad=1.5", facecolor='#e6f2ff', edgecolor='#377eb8', lw=2))
        ax5.text(0.5, -0.15, "Math: Since pipeline algorithms don't use 'Training Loss',\nthese KPIs provide the absolute mathematical grade of the topics.",
                 fontsize=10, ha='center', va='top', transform=ax5.transAxes, bbox=box_style)

        # 6. Summary Panel
        ax6 = plt.subplot(3, 2, 6)
        ax6.axis('off')
        summary_text = (
            "=== PIPELINE ARCHITECTURE ===\n\n"
            f"1. Embeddings: {ui_embedding.split()[0]}\n"
            f"2. Pooling: {ui_pooling.split()[0]}\n"
            f"3. N-Grams: {ngram_range}\n"
            f"4. Geometry: {algo_used}\n"
            f"5. Clustering: {cluster_algo}\n"
            f"6. Representation: {ui_extraction.split()[0]}\n\n"
            "This modular pipeline successfully transforms unstructured text\n"
            "into mathematically validated semantic domains."
        )
        ax6.text(0.1, 0.5, summary_text, fontsize=12, va='center', ha='left',
                 bbox=dict(boxstyle="square,pad=1", facecolor='#f0f0f0', edgecolor='grey', lw=2))

        plt.subplots_adjust(hspace=0.6, wspace=0.3)
        st.pyplot(fig)
        # Release the figure so repeated button presses do not pile up open figures.
        plt.close(fig)