File size: 23,933 Bytes
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
932b263
 
 
 
 
 
 
 
 
 
 
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
2723f8a
6716ee9
 
 
 
 
 
 
 
 
2723f8a
 
 
6716ee9
 
a0e5e8f
6716ee9
2d77f14
6716ee9
 
 
 
 
 
 
 
 
a0e5e8f
 
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328bfd7
6716ee9
 
328bfd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2723f8a
 
 
328bfd7
 
 
 
 
 
a0e5e8f
 
328bfd7
 
 
 
 
 
6716ee9
 
 
 
a0e5e8f
 
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e5e8f
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e5e8f
6716ee9
 
 
 
 
 
2d77f14
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b79d0d8
a0e5e8f
 
b79d0d8
6716ee9
 
 
 
b53e4bf
a0e5e8f
 
 
b79d0d8
a0e5e8f
 
 
b53e4bf
a0e5e8f
b9d5e60
 
 
a0e5e8f
 
 
 
b79d0d8
b53e4bf
b79d0d8
 
 
a0e5e8f
 
 
 
b79d0d8
b53e4bf
b79d0d8
 
 
b9d5e60
b53e4bf
 
6716ee9
b53e4bf
b9d5e60
 
 
 
 
 
3f7617c
 
 
 
 
b53e4bf
a0e5e8f
b79d0d8
b9d5e60
 
 
 
 
 
b53e4bf
 
 
 
 
b9d5e60
b53e4bf
 
 
 
 
a0e5e8f
b53e4bf
 
 
 
 
b9d5e60
b53e4bf
 
 
 
 
b79d0d8
 
6716ee9
 
 
b79d0d8
6716ee9
b79d0d8
932b263
 
 
 
 
b79d0d8
 
 
6716ee9
b79d0d8
 
 
6716ee9
 
 
b79d0d8
6716ee9
b79d0d8
 
 
 
 
 
 
 
 
 
 
6716ee9
b79d0d8
 
 
6716ee9
 
 
 
b79d0d8
 
 
 
 
 
 
 
 
 
 
 
6716ee9
b79d0d8
6716ee9
b79d0d8
6716ee9
 
 
b79d0d8
d8c503d
 
b79d0d8
 
 
 
 
d8c503d
b79d0d8
d8c503d
b79d0d8
6716ee9
b79d0d8
6716ee9
b79d0d8
6716ee9
932b263
 
 
 
 
6716ee9
 
 
 
932b263
b79d0d8
 
 
 
 
 
 
 
 
 
 
6716ee9
 
 
d2b54ae
 
b79d0d8
6716ee9
 
d2b54ae
6716ee9
b79d0d8
 
d2b54ae
b79d0d8
d2b54ae
 
 
b79d0d8
d2b54ae
b79d0d8
d2b54ae
6716ee9
b79d0d8
d2b54ae
 
 
 
 
 
b79d0d8
 
 
6716ee9
 
 
b79d0d8
 
 
 
 
932b263
b79d0d8
6716ee9
 
 
 
b79d0d8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import streamlit as st
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sentence_transformers import SentenceTransformer, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from umap import UMAP
from hdbscan import HDBSCAN
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

warnings.filterwarnings("ignore")

# ==========================================
# 1. PAGE CONFIGURATION & MAPPINGS
# ==========================================
st.set_page_config(page_title="Topic Modeling Pipeline", layout="wide", initial_sidebar_state="expanded")

# Add this right below set_page_config to stop the screen from jumping left/right
st.markdown("""

    <style>

        /* Force the vertical scrollbar to always show so the page width never changes */

        html { overflow-y: scroll; }

        /* Prevent horizontal scrolling */

        .block-container { max-width: 100%; overflow-x: hidden; }

    </style>

    """, unsafe_allow_html=True)

# UI label -> Hugging Face model identifier handed to load_embedder().
EMBEDDING_MAP = {
    "MiniLM (Fast & Lightweight)": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet (High Accuracy)": "sentence-transformers/all-mpnet-base-v2",
    "Specter2 (Scientific/Academic)": "allenai/specter2_base"
}

# UI label -> pooling keyword. load_embedder() tests these values with
# substring checks ("mean", "max", "cls"), so "mean-max" enables both the
# mean and the max pooling modes at once.
POOLING_MAP = {
    "Mean (Smooth context)": "mean",
    "Max (Sharp keywords)": "max",
    "CLS (Classification)": "cls",
    "Mean-Max (Combined)": "mean-max"
}

# --- CACHE THE NEURAL NETWORK ---
@st.cache_resource
def load_embedder(model_name, pool_strat):
    """Assemble a SentenceTransformer with a configurable pooling layer.

    Args:
        model_name: Model identifier passed to ``models.Transformer``.
        pool_strat: Pooling keyword. Each of the substrings "mean", "max"
            and "cls" switches on the matching pooling mode, so a value
            such as "mean-max" enables two modes together.

    Returns:
        A SentenceTransformer made of the transformer, the pooling layer
        and a final ``models.Normalize()`` module.
    """
    transformer = models.Transformer(model_name)
    hidden_size = transformer.get_word_embedding_dimension()

    # Pooling keyword argument -> whether the strategy string requests it.
    pooling_flags = {
        "pooling_mode_mean_tokens": "mean" in pool_strat,
        "pooling_mode_max_tokens": "max" in pool_strat,
        "pooling_mode_cls_token": "cls" in pool_strat,
    }
    pooling = models.Pooling(hidden_size, **pooling_flags)

    # The trailing Normalize module is always appended so every sentence
    # vector leaves the model L2-normalised.
    pipeline = [transformer, pooling, models.Normalize()]
    return SentenceTransformer(modules=pipeline)

# ==========================================
# 2. THE GUIDED UI
# ==========================================
st.title("BERTopic - Topic Modeling Pipeline with Math Visualization")

# Show the optional architecture diagram only when the file is present.
# Checking up front is more reliable than catching FileNotFoundError:
# recent Streamlit releases can surface a missing image as their own
# media-storage error type, which the old handler would not catch.
if os.path.isfile("pipeline.png"):
    st.image("pipeline.png", use_container_width=True)

st.divider()

# Step 0 heading and preprocessing hint (emoji restored from mis-decoded UTF-8).
st.header("📥 Step 0: Input Data & Core Settings")
st.info("💡 **Preprocessing Note:** You do not need to manually lowercase or strip punctuation. The `CountVectorizer` algorithm and the `Uncased` BERT Neural Networks handle casing and token normalization autonomously at the mathematical level.")

data_source = st.radio("Choose Data Source:", ["Use Sample ACM Abstract", "Paste Text"], horizontal=True)

sample_abstract = """

Students who registered for the Mapping with Google massive open online course (MOOC) 

were asked several questions during the registration process to identify prior 

experience with eleven skills as well as their goals for registering for the course. 

At the end of the course, we compared students' self reports of goal achievement 

with behavioral click-stream analysis. In addition, we assessed how well prior 

skill in a subject predicts a student's course completion and found no correlation. 

Our research shows that students who completed course activities were more likely 

to earn certificates of completion than peers who did not.

"""

# Text box is pre-filled with the sample abstract only when that data source
# is selected; otherwise it starts empty for pasted text.
raw_data = st.text_area("Text Data:", value=sample_abstract if data_source == "Use Sample ACM Abstract" else "", height=150)

# Two side-by-side sliders; positional arguments are (min, max, default).
col_a, col_b = st.columns(2)
with col_a:
    # Requested number of themes (later capped by the sentence count in the
    # K-Means fallback path).
    n_themes = st.slider("Target Number of Themes", 2, 20, 3)
with col_b:
    # Number of keywords shown for every theme.
    words_per_theme = st.slider("Words to Output per Theme", 3, 10, 5)

# --- THE VERTICAL CONFIGURATION WIZARD ---
st.header("⚙️ Model Configuration")

# Layer 1: which embedding model and pooling strategy to use. The selected
# labels are translated through EMBEDDING_MAP / POOLING_MAP at run time.
with st.expander("1️⃣ Semantic Layer (Embeddings & Pooling)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `all-MiniLM-L6-v2` with `Mean` pooling.*")
    ui_embedding = st.selectbox("Embedding Model (Override Default):", list(EMBEDDING_MAP))
    ui_pooling = st.selectbox("Pooling Strategy (Override Default):", list(POOLING_MAP))

# Layer 2: dimensionality-reduction algorithm. A distance metric is only
# offered for UMAP; for PCA it is pinned to "euclidean".
with st.expander("2️⃣ Geometry Layer (Dimensionality Reduction)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `UMAP` with `Cosine` distance to reduce 384D to 5D space.*")
    ui_algo = st.selectbox("Algorithm", ["UMAP (Complex geometry)", "PCA (Fast/Deterministic)"])
    if "UMAP" in ui_algo:
        ui_metric = st.selectbox("Distance Metric", ["cosine", "euclidean", "manhattan"])
    else:
        ui_metric = "euclidean"
        st.info("PCA inherently uses Variance (Euclidean math), so distance metrics are bypassed.")

# Layer 3: read-only explanation of the clustering strategy (no widgets).
with st.expander("3️⃣ Clustering Layer (Grouping)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `HDBSCAN` exclusively (which crashes on tiny datasets).*")
    # Raw string: the LaTeX `\ge` is not a valid Python escape sequence and
    # would otherwise trigger an invalid-escape warning at compile time.
    # The closing emphasis marker on the first sentence must directly follow
    # the full stop, otherwise Markdown prints the asterisks literally.
    st.markdown(r"""

    *The model mathematically draws boundaries around similar sentences.*

    * **Primary clustering algorithm (HDBSCAN):** Runs on datasets $\ge$ 15 sentences. Automatically filters outliers and finds dense semantic clouds. *(Defaults: min_cluster_size=10)*

    * **Fallback clustering algorithm (K-Means):** Runs on datasets $<$ 15 sentences. Forces all sentences into buckets to prevent math crashes on tiny text samples.

    """)

# Layer 4: CountVectorizer settings. ngram_range is a (min, max) tuple;
# auto_noise toggles the max_df filter applied when the vectorizer is built.
with st.expander("4️⃣ Vocabulary Layer (Vectorization)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `Unigrams` (1 word) and does **not** filter redundant dataset noise.*")
    ngram_range = st.slider("N-Gram Range", 1, 3, (1, 2), help="1=Unigrams, 2=Bigrams (e.g., 'machine learning')")
    auto_noise = st.checkbox("Auto-Remove Redundant Noise (max_df)", value=True, help="Mathematically deletes words appearing in >85% of documents.")

# Layer 5: optional representation model applied on top of c-TF-IDF.
# mmr_diversity is only meaningful (non-None) when MMR is selected.
with st.expander("5️⃣ Extraction Layer (Representation)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** ALWAYS extracts baseline words using **c-TF-IDF** (Word Frequency).*")
    ui_extraction = st.selectbox("Apply Advanced Filter on top of c-TF-IDF:", ["None (Base c-TF-IDF only)", "KeyBERTInspired (Semantic cosine)", "MMR (Reduce redundancy)"])
    if "MMR" in ui_extraction:
        mmr_diversity = st.slider("MMR Diversity Penalty", 0.0, 1.0, 0.3)
    else:
        mmr_diversity = None

st.header("📊 Evaluation Metrics")
# Every supported KPI is offered and pre-selected; the option list is
# written once and copied so options and defaults cannot drift apart.
available_kpis = ["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"]
eval_metrics = st.multiselect(
    "Select KPIs to generate a final report card:",
    available_kpis,
    default=list(available_kpis),
)

st.divider()

# ==========================================
# 3. ENGINE EXECUTION
# ==========================================
if st.button("πŸš€ Run Topic Modeling Pipeline", type="primary", use_container_width=True):
    
    if not raw_data or len(raw_data) < 20:
        st.error("Please provide more text data!")
        st.stop()

    with st.spinner("Processing Semantic Pipeline... (Models are cached to prevent crashes)"):
        
        # Naive sentence segmentation: split on '.', trim, and keep only
        # fragments longer than 10 characters.
        trimmed_fragments = (fragment.strip() for fragment in raw_data.split('.'))
        sentences = [fragment for fragment in trimmed_fragments if len(fragment) > 10]
        dataset_size = len(sentences)

        # Input can pass the 20-character check above and still contain
        # fewer than two usable sentences. The small-data path fits
        # PCA(n_components=2) and K-Means, which need at least two samples,
        # so stop with a readable message instead of a traceback.
        if dataset_size < 2:
            st.error("Please provide at least two sentences (each longer than 10 characters and separated by periods).")
            st.stop()
        
        # Extra domain-specific stop words merged with scikit-learn's built-in
        # English stop-word list.
        academic_noise = ['students', 'course', 'research', 'paper', 'found', 'likely', 'did']
        from sklearn.feature_extraction import text
        stop_w = list(text.ENGLISH_STOP_WORDS.union(academic_noise))
        
        # max_df is lowered to 0.85 only when the user enabled noise removal AND
        # there are more than 10 sentences; otherwise 1.0 disables the filter.
        vectorizer_model = CountVectorizer(stop_words=stop_w, ngram_range=ngram_range, max_df=0.85 if auto_noise and dataset_size > 10 else 1.0)
        
        # Translate the UI labels to model id / pooling keyword, then encode
        # every sentence. `embeddings` is reused later for the silhouette score
        # and for the visualisation panels.
        custom_embedder = load_embedder(EMBEDDING_MAP[ui_embedding], POOLING_MAP[ui_pooling])
        embeddings = custom_embedder.encode(sentences)
        
        # Choose the reduction + clustering pair.
        #   * Fewer than 15 sentences, or PCA chosen in the UI -> PCA + K-Means.
        #   * Otherwise                                         -> UMAP + HDBSCAN.
        # Both branches must bind the SAME names (dim_model, cluster_model,
        # reduce_topics, algo_used, cluster_algo) because the BERTopic
        # constructor and the plots below read them unconditionally.
        is_fallback = False
        safe_n_themes = n_themes
        if dataset_size < 15 or "PCA" in ui_algo:
            # K-Means cannot create more clusters than there are samples.
            safe_n_themes = min(n_themes, dataset_size)
            dim_model = PCA(n_components=2, random_state=42)
            cluster_model = KMeans(n_clusters=safe_n_themes, random_state=42)
            # K-Means already yields exactly the requested number of clusters,
            # so BERTopic's topic-reduction step is switched off.
            reduce_topics = None
            is_fallback = True
            algo_used = "PCA"
            cluster_algo = "K-Means"
        else:
            dim_model = UMAP(n_neighbors=15, n_components=5, metric=ui_metric, random_state=42)
            # Fixed: this was bound to `clustering_model`, leaving
            # `cluster_model` undefined and raising NameError on this path.
            cluster_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
            reduce_topics = n_themes
            algo_used = "UMAP"
            cluster_algo = "HDBSCAN"

        # Optional representation model that re-ranks the c-TF-IDF keywords.
        if "MMR" in ui_extraction:
            rep_model = MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=words_per_theme)
        elif "KeyBERT" in ui_extraction:
            rep_model = KeyBERTInspired(top_n_words=words_per_theme)
        else:
            rep_model = None

        # BERTopic accepts any reducer / clusterer through its umap_model and
        # hdbscan_model slots, which is how PCA and K-Means are plugged in.
        topic_model = BERTopic(
            embedding_model=custom_embedder,
            umap_model=dim_model,
            hdbscan_model=cluster_model,
            vectorizer_model=vectorizer_model,
            representation_model=rep_model,
            nr_topics=reduce_topics,
            top_n_words=words_per_theme,
            language="english"
        )
        # Hand over the embeddings computed above so the sentences are not
        # encoded a second time inside fit_transform. This also guarantees
        # the plots and the silhouette score use the exact vectors that were
        # clustered.
        topics, _ = topic_model.fit_transform(sentences, embeddings=embeddings)

    # ==========================================
    # 4. UI DISPLAY & METRICS
    # ==========================================
    st.success("Analysis Complete!")
    
    # Explain any deviation from what the user asked for.
    if is_fallback:
        if safe_n_themes < n_themes:
            st.warning(f"⚠️ **Reduced requested themes from {n_themes} to {safe_n_themes}.**\n\n"
                       f"*Reason:* BERTopic clusters complete sentences to preserve context. "
                       f"You cannot sort {dataset_size} sentences into {n_themes} buckets without leaving empty buckets, "
                       f"which mathematically breaks the clustering algorithms!")
        elif "PCA" not in ui_algo:
            # Only claim an automatic switch when the user did NOT choose PCA;
            # in that case the fallback can only have been triggered by the
            # small sentence count.
            st.info(f"ℹ️ Auto-switched to PCA/K-Means due to small dataset size ({dataset_size} sentences).")
            
    st.markdown("### 🏆 Discovered Themes")
    topic_info = topic_model.get_topic_info()
    # One keyword list per theme; reused by the diversity / coherence metrics.
    all_words = []

    cols = st.columns(3)
    # Topic -1 is the outlier bucket and is not shown as a theme.
    theme_ids = [t_id for t_id in topic_info['Topic'] if t_id != -1]
    for col_idx, t_id in enumerate(theme_ids):
        # BERTopic pads short topics with empty-string placeholders; drop
        # them so they are neither rendered as blank bullets nor counted as
        # words by the metrics.
        theme_w = [word for word, _score in topic_model.get_topic(t_id) if word]
        all_words.append(theme_w)
        # Cards wrap across the three columns.
        with cols[col_idx % 3]:
            st.info(f"**Theme {t_id + 1}**\n\n" + "\n".join([f"🔹 {w}" for w in theme_w]))

    # --- METRICS CALCULATIONS ---
    # Numeric values and display strings for every KPI. The display strings
    # default to "Skipped" and are also printed in the figure's STEP 5 panel.
    div_val, npmi_val, umass_val, sil_val = 0.0, 0.0, 0.0, 0.0
    div_status, npmi_status, umass_status, sil_status = "Skipped", "Skipped", "Skipped", "Skipped"
    u_words_len, t_words_len = 0, 0

    if eval_metrics:
        with st.spinner("Calculating mathematical metrics... (NPMI requires building a dictionary and takes a moment)"):

            # 1. Diversity: unique keywords / total keywords across all themes.
            if "Topic Diversity" in eval_metrics and all_words:
                u_words = {w for t in all_words for w in t}
                t_words = sum(len(t) for t in all_words)
                u_words_len, t_words_len = len(u_words), t_words
                div_val = float(len(u_words) / t_words) if t_words > 0 else 0.0
                div_status = f"{div_val:.2f}"

            # 2. Coherence models (NPMI & UMass). Each metric is computed in
            #    its own try block so that a failure in one can no longer
            #    overwrite the successfully computed status of the other.
            coherence_requests = [
                (label, measure)
                for label, measure in (("NPMI Coherence", "c_npmi"), ("UMass Coherence", "u_mass"))
                if label in eval_metrics
            ]
            coherence_results = {}
            if coherence_requests:
                try:
                    # Tokenise with the vectorizer's own analyzer so the
                    # tokens (n-grams, stop words) match the topic keywords.
                    analyzer = vectorizer_model.build_analyzer()
                    tokenized = [analyzer(s) for s in sentences]
                    dictionary = corpora.Dictionary(tokenized)
                except Exception:
                    # Best-effort by design: a KPI must never abort the app.
                    tokenized, dictionary = None, None

                for label, measure in coherence_requests:
                    if dictionary is None:
                        coherence_results[label] = (0.0, "Skipped (Data too small)")
                        continue
                    try:
                        score = CoherenceModel(
                            topics=all_words, texts=tokenized, dictionary=dictionary, coherence=measure
                        ).get_coherence()
                    except Exception:
                        coherence_results[label] = (0.0, "Skipped (Data too small)")
                        continue
                    if np.isnan(score):
                        coherence_results[label] = (0.0, "N/A (Too few words)")
                    else:
                        coherence_results[label] = (float(score), f"{float(score):.2f}")

            npmi_val, npmi_status = coherence_results.get("NPMI Coherence", (npmi_val, npmi_status))
            umass_val, umass_status = coherence_results.get("UMass Coherence", (umass_val, umass_status))

            # 3. Silhouette score on the original embeddings, outliers (-1)
            #    excluded. Needs at least two clusters and fewer clusters
            #    than samples.
            if "Silhouette Score" in eval_metrics:
                valid_idx = [i for i, t in enumerate(topics) if t != -1]
                unique_topics = {topics[i] for i in valid_idx}
                if 1 < len(unique_topics) < len(valid_idx):
                    sil_val = float(silhouette_score(
                        np.array([embeddings[i] for i in valid_idx]),
                        [topics[i] for i in valid_idx],
                        metric='cosine'
                    ))
                    sil_status = f"{sil_val:.2f}"
                else:
                    sil_status = "Skipped (Themes need ≥2 sentences)"

        # --- RENDER KPI DASHBOARD WITH TOOLTIPS ---
        st.markdown("### 📊 Key Performance Indicators (KPI)")
        kpi_cols = st.columns(len(eval_metrics))

        # Metric label -> (display value, tooltip text).
        kpi_display = {
            "Topic Diversity": (
                div_status,
                "Math: Unique Words / Total Words.\nTarget: 1.0 (No redundant words across themes).",
            ),
            "NPMI Coherence": (
                npmi_status,
                "Math: Normalized Pointwise Mutual Information.\nCalculates joint probability of words existing together.\nTarget: >0.1",
            ),
            "UMass Coherence": (
                umass_status,
                "Math: Internal log-conditional probability.\nEvaluates if words co-occur strictly inside your uploaded dataset.\nTarget: Closer to 0.",
            ),
            "Silhouette Score": (
                sil_status,
                "Math: (b - a) / max(a,b).\nMeasures intra-cluster density (a) vs nearest-cluster distance (b).\nTarget: >0.0",
            ),
        }

        # One column per selected metric, in the order the user picked them.
        for kpi_col, metric in zip(kpi_cols, eval_metrics):
            if metric not in kpi_display:
                continue
            kpi_value, kpi_help = kpi_display[metric]
            with kpi_col:
                st.metric(label=metric, value=kpi_value, help=kpi_help)
    # ==========================================
    # 5. XAI VISUALIZATION GRAPH (With Live Math & Matrices)
    # ==========================================
    # Section heading for the explanatory figure (emoji restored from mis-decoded UTF-8).
    st.markdown("### 📈 Explainable AI (XAI) Architecture Map")
    
    with st.spinner("Rendering Mathematical Dashboard..."):
        # Global seaborn look, then one large figure that hosts the 3x2 grid
        # of panels created below via plt.subplot(3, 2, k).
        sns.set_theme(style="whitegrid")
        fig = plt.figure(figsize=(18, 16)) 
        
        # Safe extraction for the title: the first whitespace-separated word of
        # each UI label (e.g. "Mean", "KeyBERTInspired", "None").
        pool_title = ui_pooling.split()[0]
        rep_title = ui_extraction.split()[0]
        fig.suptitle(f"Topic Modeling Mathematical Pipeline\n(Pooling: {pool_title} | Rep: {rep_title})", fontsize=20, fontweight='bold', y=0.98)
        
        # Style for the Math/Data boxes drawn underneath panels 1-5
        box_style = dict(boxstyle="round,pad=0.5", facecolor='#f8f9fa', edgecolor='#4b72b8', alpha=0.95, lw=2)

        # --------------------------------------------------
        # 1. Embeddings & Pooling
        # --------------------------------------------------
        # Panel 1 (top-left): heatmap of the sentence embeddings, rows =
        # sentences, columns = the first 50 embedding dimensions.
        ax1 = plt.subplot(3, 2, 1)
        sns.heatmap(embeddings[:, :50], cmap="viridis", cbar=False, ax=ax1)
        ax1.set_title("STEP 1: Embeddings & Pooling", fontsize=13, fontweight='bold')
        ax1.set_ylabel("Sentences (Docs)")
        ax1.set_xlabel("Vector Dimensions (First 50 shown)")
        
        # Live Data Extraction
        emb_shape = embeddings.shape
        emb_sample = np.round(embeddings[0, :5], 3).tolist() # First 5 numbers of Doc 1
        
        # NOTE(review): the caption always shows the mean-pooling formula,
        # regardless of the pooling strategy selected in the UI.
        math_text_1 = (
            r"$\mathbf{Math (Mean Pool):} \quad v = \frac{1}{N} \sum_{i=1}^{N} \text{BERT}(w_i)$" + "\n"
            f"Matrix Shape: {emb_shape} (Docs x Dims)\n"
            f"Doc 1 [Dims 1-5]: {emb_sample}..."
        )
        # Caption box is anchored in axes coordinates, just below the panel.
        ax1.text(0.5, -0.25, math_text_1, fontsize=11, ha='center', va='top', transform=ax1.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 2. Geometry (Dimensionality Reduction)
        # --------------------------------------------------
        # Panel 2 (top-right): the sentences after dimensionality reduction.
        ax2 = plt.subplot(3, 2, 2)
        # Re-project the embeddings with the reducer fitted inside BERTopic.
        reduced_embeddings = topic_model.umap_model.transform(embeddings)
        # Only the first two reduced dimensions can be drawn on a 2-D scatter.
        ax2.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c='grey', s=100, alpha=0.6, edgecolor='k')
        ax2.set_title(f"STEP 2: Geometry ({algo_used})", fontsize=13, fontweight='bold')

        # Live Data Extraction
        red_shape = reduced_embeddings.shape
        red_sample = np.round(reduced_embeddings[0, :2], 3).tolist() # X, Y coord of Doc 1

        eq_2 = r"$\mathbf{Math (PCA):} \quad Z = X \cdot W_{2D}$" if algo_used == "PCA" else r"$\mathbf{Math (UMAP):} \quad \text{Topological Manifold Approx.}$"
        # Report the real output dimensionality: PCA is fitted with 2
        # components but UMAP with 5, so a hard-coded "2D" contradicted the
        # matrix shape printed right next to it.
        n_reduced_dims = red_shape[1]
        math_text_2 = (
            f"{eq_2}\n"
            f"Matrix Shape: {red_shape} (Docs x {n_reduced_dims}D Coordinates)\n"
            f"Doc 1 Coordinate: [X: {red_sample[0]}, Y: {red_sample[1]}]"
        )
        ax2.text(0.5, -0.25, math_text_2, fontsize=11, ha='center', va='top', transform=ax2.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 3. Clustering
        # --------------------------------------------------
        # Panel 3 (middle-left): same 2-D scatter as panel 2, now coloured by
        # the topic label assigned to each sentence.
        ax3 = plt.subplot(3, 2, 3)
        ax3.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=topics, cmap='tab10', s=150, edgecolor='k')
        ax3.set_title(f"STEP 3: Clustering ({cluster_algo})", fontsize=13, fontweight='bold')
        
        # Live Data Extraction
        topic_sample = topics[:5] # Grabs up to the first 5
        
        # Formula shown depends on which clustering algorithm actually ran.
        eq_3 = r"$\mathbf{Math (K-Means):} \quad \arg\min_S \sum ||x_i - \mu_c||^2$" if cluster_algo == "K-Means" else r"$\mathbf{Math (HDBSCAN):} \quad \text{Density} = \frac{1}{\text{core\_dist}(x)}$"
        math_text_3 = (
            f"{eq_3}\n"
            f"Output Array Shape: ({len(topics)},) (1 Label per Doc)\n"
            f"First {len(topic_sample)} Doc Assignments: {topic_sample}"
        )
        
        ax3.text(0.5, -0.25, math_text_3, fontsize=11, ha='center', va='top', transform=ax3.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 4. Representation
        # --------------------------------------------------
        # Panel 4 (middle-right): keyword scores of the first non-outlier topic.
        ax4 = plt.subplot(3, 2, 4)
        
        # Safely grab the first valid topic found (topic -1 is skipped)
        valid_topics = [t for t in topic_model.get_topics().keys() if t != -1]
        theme_1_data = topic_model.get_topic(valid_topics[0]) if valid_topics else None

        if theme_1_data:
            # Each entry is a (word, score) pair. Both lists are reversed so
            # the first returned entry is drawn last, i.e. at the top of the
            # horizontal bar chart; it is also what [-1] refers to below.
            words = [x[0] for x in theme_1_data][::-1] 
            scores = [x[1] for x in theme_1_data][::-1]
            ax4.barh(words, scores, color='coral', edgecolor='black')
            ax4.set_title(f"STEP 4: Topic Representation ({rep_title})", fontsize=13, fontweight='bold')
            
            # Live Data Extraction
            top_word_score = round(scores[-1], 4)
            # vocabulary_ only exists once the vectorizer has been fitted.
            vocab_len = len(vectorizer_model.vocabulary_) if hasattr(vectorizer_model, 'vocabulary_') else 'N/A'
            
            math_text_4 = (
                r"$\mathbf{Math (c-TF-IDF):} \quad W_{t,c} = tf_{t,c} \times \log\left(1 + \frac{A}{df_t}\right)$" + "\n"
                f"Global Vocab Extracted: {vocab_len} terms\n"
                f"Top Word ('{words[-1]}') Score: {top_word_score}"
            )
            ax4.text(0.5, -0.25, math_text_4, fontsize=11, ha='center', va='top', transform=ax4.transAxes, bbox=box_style)
        else:
            # No non-outlier topic (or an empty one): show a placeholder.
            ax4.text(0.5, 0.5, "Theme not found", ha='center', transform=ax4.transAxes)

        # --------------------------------------------------
        # 5. KPI Dashboard (Updated with UMass)
        # --------------------------------------------------
        # Panel 5 (bottom-left): text-only panel listing each KPI formula
        # together with the live value computed above ("Skipped" when the
        # metric was not selected or could not be computed).
        ax5 = plt.subplot(3, 2, 5)
        ax5.axis('off')
        ax5.set_title("STEP 5: Post-Hoc Evaluation Formulas", fontsize=13, fontweight='bold', y=0.95)

        # Condensed to fit all 4 metrics in one box.
        kpi_math = (
            r"$\mathbf{Diversity:} \quad D = \frac{| \text{Unique} |}{| \text{Total} |}$" + f"  [Live: {div_status}]\n\n"

            r"$\mathbf{Silhouette:} \quad S = \frac{b - a}{\max(a, b)}$" + f"  [Live: {sil_status}]\n\n"

            r"$\mathbf{NPMI:} \quad \frac{\log(P(x,y) / P(x)P(y))}{-\log P(x,y)}$" + f"  [Live: {npmi_status}]\n\n"

            r"$\mathbf{UMass:} \quad \log \frac{P(x,y) + \epsilon}{P(x)}$" + f"  [Live: {umass_status}]"
        )

        # Positioned in axes coordinates like every other caption, so the
        # placement does not depend on the (hidden) data limits of the axes.
        ax5.text(0.5, 0.45, kpi_math, fontsize=12, va='center', ha='center', transform=ax5.transAxes,
                 bbox=dict(boxstyle="square,pad=1.2", facecolor='#e6f2ff', edgecolor='#377eb8', lw=2))

        ax5.text(0.5, -0.15, "Math: UMass measures internal dataset logic. NPMI measures external logic.\nSilhouette measures geometric separation.",
                 fontsize=10, ha='center', va='top', transform=ax5.transAxes, bbox=box_style)
        
        # --------------------------------------------------
        # 6. Summary Matrix Transformations
        # --------------------------------------------------
        # Panel 6 (bottom-right): text-only recap of how the data shape
        # changes at each pipeline stage, using the live shapes captured above.
        ax6 = plt.subplot(3, 2, 6)
        ax6.axis('off') 
        summary_text = (
            "=== THE MATRIX TRANSFORMATION LIFECYCLE ===\n\n"
            f"1. Raw Text $\\rightarrow$ {emb_shape} Matrix (Dense Meaning)\n"
            f"2. {emb_shape} $\\rightarrow$ {red_shape} Matrix (Geometric Compression)\n"
            f"3. {red_shape} $\\rightarrow$ ({len(topics)},) Array (Discrete Bucketing)\n"
            f"4. ({len(topics)},) $\\rightarrow$ c-TF-IDF Matrix (Word Extraction)\n"
            f"5. c-TF-IDF $\\rightarrow$ {words_per_theme} Output Words (Per Theme)\n\n"  # uses the slider value, not a fixed count
            "This proves Topic Modeling is a sequence of \ndimensionality reductions and matrix multiplications."
        )
        # No transform is given, so the position is in data coordinates of the
        # hidden axes.
        ax6.text(0.1, 0.5, summary_text, fontsize=12, va='center', ha='left', 
                 bbox=dict(boxstyle="square,pad=1", facecolor='#f0f0f0', edgecolor='grey', lw=2))

        # Extra vertical room so the caption boxes under each panel do not
        # collide with the panel below.
        fig.subplots_adjust(hspace=0.7, wspace=0.3)
        st.pyplot(fig, use_container_width=True)
        # pyplot keeps a reference to every figure it creates. Without an
        # explicit close, each button press would leave another 18x16-inch
        # figure alive in the server process.
        plt.close(fig)