Spaces:

gtech13
/

Topic-Modeling-BERTopic-Math-Visualization

Sleeping

Topic-Modeling-BERTopic-Math-Visualization

File size: 23,933 Bytes

6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
932b263
 
 
 
 
 
 
 
 
 
 
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
2723f8a
6716ee9
 
 
 
 
 
 
 
 
2723f8a
 
 
6716ee9
 
a0e5e8f
6716ee9
2d77f14
6716ee9
 
 
 
 
 
 
 
 
a0e5e8f
 
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328bfd7
6716ee9
 
328bfd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2723f8a
 
 
328bfd7
 
 
 
 
 
a0e5e8f
 
328bfd7
 
 
 
 
 
6716ee9
 
 
 
a0e5e8f
 
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e5e8f
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e5e8f
6716ee9
 
 
 
 
 
2d77f14
6716ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b79d0d8
a0e5e8f
 
b79d0d8
6716ee9
 
 
 
b53e4bf
a0e5e8f
 
 
b79d0d8
a0e5e8f
 
 
b53e4bf
a0e5e8f
b9d5e60
 
 
a0e5e8f
 
 
 
b79d0d8
b53e4bf
b79d0d8
 
 
a0e5e8f
 
 
 
b79d0d8
b53e4bf
b79d0d8
 
 
b9d5e60
b53e4bf
 
6716ee9
b53e4bf
b9d5e60
 
 
 
 
 
3f7617c
 
 
 
 
b53e4bf
a0e5e8f
b79d0d8
b9d5e60
 
 
 
 
 
b53e4bf
 
 
 
 
b9d5e60
b53e4bf
 
 
 
 
a0e5e8f
b53e4bf
 
 
 
 
b9d5e60
b53e4bf
 
 
 
 
b79d0d8
 
6716ee9
 
 
b79d0d8
6716ee9
b79d0d8
932b263
 
 
 
 
b79d0d8
 
 
6716ee9
b79d0d8
 
 
6716ee9
 
 
b79d0d8
6716ee9
b79d0d8
 
 
 
 
 
 
 
 
 
 
6716ee9
b79d0d8
 
 
6716ee9
 
 
 
b79d0d8
 
 
 
 
 
 
 
 
 
 
 
6716ee9
b79d0d8
6716ee9
b79d0d8
6716ee9
 
 
b79d0d8
d8c503d
 
b79d0d8
 
 
 
 
d8c503d
b79d0d8
d8c503d
b79d0d8
6716ee9
b79d0d8
6716ee9
b79d0d8
6716ee9
932b263
 
 
 
 
6716ee9
 
 
 
932b263
b79d0d8
 
 
 
 
 
 
 
 
 
 
6716ee9
 
 
d2b54ae
 
b79d0d8
6716ee9
 
d2b54ae
6716ee9
b79d0d8
 
d2b54ae
b79d0d8
d2b54ae
 
 
b79d0d8
d2b54ae
b79d0d8
d2b54ae
6716ee9
b79d0d8
d2b54ae
 
 
 
 
 
b79d0d8
 
 
6716ee9
 
 
b79d0d8
 
 
 
 
932b263
b79d0d8
6716ee9
 
 
 
b79d0d8

import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import streamlit as st
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sentence_transformers import SentenceTransformer, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from umap import UMAP
from hdbscan import HDBSCAN
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

warnings.filterwarnings("ignore")

# ==========================================
# 1. PAGE CONFIGURATION & MAPPINGS
# ==========================================
# Wide layout with the sidebar expanded on first load.
st.set_page_config(page_title="Topic Modeling Pipeline", layout="wide", initial_sidebar_state="expanded")

# Injected right after set_page_config: a small CSS override that keeps the
# vertical scrollbar permanently visible (so the page width never changes and
# the layout does not jump left/right) and hides horizontal overflow of the
# main container. unsafe_allow_html=True is needed for the raw <style> tag.
st.markdown("""

    <style>

        /* Force the vertical scrollbar to always show so the page width never changes */

        html { overflow-y: scroll; }

        /* Prevent horizontal scrolling */

        .block-container { max-width: 100%; overflow-x: hidden; }

    </style>

    """, unsafe_allow_html=True)

# Human-readable selectbox label -> model identifier handed to load_embedder().
EMBEDDING_MAP = {
    "MiniLM (Fast & Lightweight)": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet (High Accuracy)": "sentence-transformers/all-mpnet-base-v2",
    "Specter2 (Scientific/Academic)": "allenai/specter2_base"
}

# Human-readable selectbox label -> pooling keyword. load_embedder() matches
# these by substring, so "mean-max" enables both the mean and the max mode.
POOLING_MAP = {
    "Mean (Smooth context)": "mean",
    "Max (Sharp keywords)": "max",
    "CLS (Classification)": "cls",
    "Mean-Max (Combined)": "mean-max"
}

# --- CACHE THE NEURAL NETWORK ---
@st.cache_resource
def load_embedder(model_name, pool_strat):
    """Build a SentenceTransformer: transformer -> pooling -> L2 normalisation.

    Args:
        model_name: Model identifier passed to ``models.Transformer``.
        pool_strat: Pooling keyword. It is matched by substring, so
            ``"mean-max"`` switches on both mean and max pooling.

    Returns:
        A ``SentenceTransformer`` made of the three modules above. The
        function is wrapped in ``st.cache_resource``.
    """
    transformer = models.Transformer(model_name)

    # Each pooling mode is enabled when its keyword occurs in the strategy string.
    use_mean = "mean" in pool_strat
    use_max = "max" in pool_strat
    use_cls = "cls" in pool_strat

    pooling = models.Pooling(
        transformer.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=use_mean,
        pooling_mode_max_tokens=use_max,
        pooling_mode_cls_token=use_cls,
    )

    # The Normalize() stage is always appended, so every returned vector is
    # L2-normalised regardless of the chosen pooling strategy.
    pipeline = [transformer, pooling, models.Normalize()]
    return SentenceTransformer(modules=pipeline)

# ==========================================
# 2. THE GUIDED UI
# ==========================================
st.title("BERTopic - Topic Modeling Pipeline with Math Visualization")

# The pipeline diagram is optional decoration: skip it silently when the file
# is not shipped with the app. The explicit existence check is needed because
# a missing path is not guaranteed to surface as FileNotFoundError (recent
# Streamlit releases may report it through their own media-storage error).
if os.path.isfile("pipeline.png"):
    try:
        st.image("pipeline.png", use_container_width=True)
    except FileNotFoundError:
        # File vanished between the check and the read; still optional.
        pass

st.divider()

# --- Step 0: text input and the two core sliders ---
st.header("📥 Step 0: Input Data & Core Settings")
st.info("💡 **Preprocessing Note:** You do not need to manually lowercase or strip punctuation. The `CountVectorizer` algorithm and the `Uncased` BERT Neural Networks handle casing and token normalization autonomously at the mathematical level.")

# Either pre-fill the text area with the bundled abstract or start empty.
data_source = st.radio("Choose Data Source:", ["Use Sample ACM Abstract", "Paste Text"], horizontal=True)

# Built-in demo text used when "Use Sample ACM Abstract" is selected.
sample_abstract = """

Students who registered for the Mapping with Google massive open online course (MOOC) 

were asked several questions during the registration process to identify prior 

experience with eleven skills as well as their goals for registering for the course. 

At the end of the course, we compared students' self reports of goal achievement 

with behavioral click-stream analysis. In addition, we assessed how well prior 

skill in a subject predicts a student's course completion and found no correlation. 

Our research shows that students who completed course activities were more likely 

to earn certificates of completion than peers who did not.

"""

# raw_data is the single text blob that the pipeline later splits into sentences.
raw_data = st.text_area("Text Data:", value=sample_abstract if data_source == "Use Sample ACM Abstract" else "", height=150)

# Two side-by-side sliders; positional arguments are (label, min, max, default).
col_a, col_b = st.columns(2)
with col_a:
    n_themes = st.slider("Target Number of Themes", 2, 20, 3)
with col_b:
    words_per_theme = st.slider("Words to Output per Theme", 3, 10, 5)

# --- THE VERTICAL CONFIGURATION WIZARD ---
# Each expander collects the settings for one pipeline stage; the values are
# only read once the run button further down is pressed.
st.header("⚙️ Model Configuration")

# Stage 1: embedding model and pooling strategy. The selectbox options are the
# keys of EMBEDDING_MAP / POOLING_MAP, so the selections can be used to look
# up the underlying identifiers.
with st.expander("1️⃣ Semantic Layer (Embeddings & Pooling)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `all-MiniLM-L6-v2` with `Mean` pooling.*")
    ui_embedding = st.selectbox("Embedding Model (Override Default):", list(EMBEDDING_MAP.keys()))
    ui_pooling = st.selectbox("Pooling Strategy (Override Default):", list(POOLING_MAP.keys()))

# Stage 2: dimensionality reduction. The distance-metric selector is shown
# only for UMAP; for PCA ui_metric is pinned to "euclidean" so the name is
# always defined.
with st.expander("2️⃣ Geometry Layer (Dimensionality Reduction)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `UMAP` with `Cosine` distance to reduce 384D to 5D space.*")
    ui_algo = st.selectbox("Algorithm", ["UMAP (Complex geometry)", "PCA (Fast/Deterministic)"])
    if "UMAP" in ui_algo:
        ui_metric = st.selectbox("Distance Metric", ["cosine", "euclidean", "manhattan"])
    else:
        ui_metric = "euclidean"
        st.info("PCA inherently uses Variance (Euclidean math), so distance metrics are bypassed.")

# Stage 3: clustering. Purely informational - there is nothing to configure;
# the algorithm is chosen automatically when the pipeline runs.
with st.expander("3️⃣ Clustering Layer (Grouping)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `HDBSCAN` exclusively (which crashes on tiny datasets).*")
    # Raw string: the LaTeX "\ge" must reach Streamlit with its backslash
    # intact, and in a normal literal "\g" is an invalid escape sequence.
    # The closing "*" of the first line directly follows the full stop, since
    # Markdown does not close emphasis after whitespace.
    st.markdown(r"""

    *The model mathematically draws boundaries around similar sentences.*

    * **Primary clustering algorithm (HDBSCAN):** Runs on datasets $\ge$ 15 sentences. Automatically filters outliers and finds dense semantic clouds. *(Defaults: min_cluster_size=10)*

    * **Fallback clustering algorithm (K-Means):** Runs on datasets $<$ 15 sentences. Forces all sentences into buckets to prevent math crashes on tiny text samples.

    """)

# Stage 4: vocabulary. The N-gram slider is given a tuple default, so it is a
# range slider and ngram_range comes back as a (min_n, max_n) pair.
with st.expander("4️⃣ Vocabulary Layer (Vectorization)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `Unigrams` (1 word) and does **not** filter redundant dataset noise.*")
    ngram_range = st.slider("N-Gram Range", 1, 3, (1, 2), help="1=Unigrams, 2=Bigrams (e.g., 'machine learning')")
    auto_noise = st.checkbox("Auto-Remove Redundant Noise (max_df)", value=True, help="Mathematically deletes words appearing in >85% of documents.")

# Stage 5: optional representation model applied on top of c-TF-IDF.
# mmr_diversity is None unless MMR is selected, so the name is always defined.
with st.expander("5️⃣ Extraction Layer (Representation)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** ALWAYS extracts baseline words using **c-TF-IDF** (Word Frequency).*")
    ui_extraction = st.selectbox("Apply Advanced Filter on top of c-TF-IDF:", ["None (Base c-TF-IDF only)", "KeyBERTInspired (Semantic cosine)", "MMR (Reduce redundancy)"])
    if "MMR" in ui_extraction:
        mmr_diversity = st.slider("MMR Diversity Penalty", 0.0, 1.0, 0.3)
    else:
        mmr_diversity = None

# Which KPIs to compute after the run; all four are pre-selected.
st.header("📊 Evaluation Metrics")
eval_metrics = st.multiselect(
    "Select KPIs to generate a final report card:",
    ["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"],
    default=["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"]
)

st.divider()

# ==========================================
# 3. ENGINE EXECUTION
# ==========================================
# Everything inside this branch runs only on the script pass triggered by the
# button click.
if st.button("🚀 Run Topic Modeling Pipeline", type="primary", use_container_width=True):

    # Reject empty or very short input before any model is loaded.
    if not raw_data or len(raw_data) < 20:
        st.error("Please provide more text data!")
        st.stop()

    with st.spinner("Processing Semantic Pipeline... (Models are cached to prevent crashes)"):

        # Split on "." and keep only fragments longer than 10 characters.
        sentences = [s.strip() for s in raw_data.split('.') if len(s.strip()) > 10]
        dataset_size = len(sentences)

        # PCA(n_components=2) below needs at least two documents, and an empty
        # list cannot be embedded at all, so stop with a readable message
        # instead of a traceback.
        if dataset_size < 2:
            st.error("Please provide at least two sentences of more than 10 characters each!")
            st.stop()

        # English stop words plus a few domain words that would otherwise
        # dominate every theme.
        academic_noise = ['students', 'course', 'research', 'paper', 'found', 'likely', 'did']
        from sklearn.feature_extraction import text
        stop_w = list(text.ENGLISH_STOP_WORDS.union(academic_noise))

        # The max_df filter is applied only when requested and when there are
        # more than 10 documents; otherwise no term is dropped by frequency.
        max_df = 0.85 if auto_noise and dataset_size > 10 else 1.0
        vectorizer_model = CountVectorizer(stop_words=stop_w, ngram_range=ngram_range, max_df=max_df)

        custom_embedder = load_embedder(EMBEDDING_MAP[ui_embedding], POOLING_MAP[ui_pooling])
        embeddings = custom_embedder.encode(sentences)

        # Fewer than 15 sentences, or an explicit PCA choice, selects the
        # PCA + K-Means pair; otherwise UMAP + HDBSCAN is used.
        is_fallback = dataset_size < 15 or "PCA" in ui_algo
        if is_fallback:
            # K-Means cannot produce more clusters than there are documents.
            safe_n_themes = min(n_themes, dataset_size)
            dim_model = PCA(n_components=2, random_state=42)
            cluster_model = KMeans(n_clusters=safe_n_themes, random_state=42)
            reduce_topics = None
            algo_used = "PCA"
            cluster_algo = "K-Means"
        else:
            dim_model = UMAP(n_neighbors=15, n_components=5, metric=ui_metric, random_state=42)
            # Must be bound to the same name as the K-Means branch: this is the
            # object handed to BERTopic as hdbscan_model below.
            cluster_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
            reduce_topics = n_themes
            algo_used = "UMAP"
            cluster_algo = "HDBSCAN"

        # Optional representation model layered on top of c-TF-IDF.
        if "MMR" in ui_extraction:
            rep_model = MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=words_per_theme)
        elif "KeyBERT" in ui_extraction:
            rep_model = KeyBERTInspired(top_n_words=words_per_theme)
        else:
            rep_model = None

        topic_model = BERTopic(
            embedding_model=custom_embedder,
            umap_model=dim_model,
            hdbscan_model=cluster_model,
            vectorizer_model=vectorizer_model,
            representation_model=rep_model,
            nr_topics=reduce_topics,
            top_n_words=words_per_theme,
            language="english"
        )
        # Reuse the embeddings computed above so the sentences are not encoded
        # a second time inside BERTopic.
        topics, _ = topic_model.fit_transform(sentences, embeddings=embeddings)

    # ==========================================
    # 4. UI DISPLAY & METRICS
    # ==========================================
    st.success("Analysis Complete!")

    if is_fallback:
        if safe_n_themes < n_themes:
            st.warning(f"⚠️ **Reduced requested themes from {n_themes} to {safe_n_themes}.**\n\n"
                       f"*Reason:* BERTopic clusters complete sentences to preserve context. "
                       f"You cannot sort {dataset_size} sentences into {n_themes} buckets without leaving empty buckets, "
                       f"which mathematically breaks the clustering algorithms!")
        elif "PCA" not in ui_algo:
            # Announce an automatic switch only when the user asked for UMAP;
            # an explicit PCA selection is not a fallback and the dataset may
            # not be small at all.
            st.info(f"ℹ️ Auto-switched to PCA/K-Means due to small dataset size ({dataset_size} sentences).")

    st.markdown("### 🏆 Discovered Themes")
    topic_info = topic_model.get_topic_info()
    # One word list per displayed theme; consumed by the metric calculations.
    all_words = []

    cols = st.columns(3)
    # Topic -1 is skipped: it is not a real theme.
    shown_topics = (t for t in topic_info['Topic'] if t != -1)
    for col_idx, t_id in enumerate(shown_topics):
        # Drop empty-string entries so padding slots are neither displayed as
        # blank bullets nor counted by the metrics.
        theme_w = [word for word, _ in topic_model.get_topic(t_id) if word]
        if theme_w:
            all_words.append(theme_w)
        # Cards are laid out round-robin over the three columns.
        with cols[col_idx % 3]:
            st.info(f"**Theme {t_id + 1}**\n\n" + "\n".join([f"🔹 {w}" for w in theme_w]))

# --- METRICS CALCULATIONS ---
    # Defaults: every metric reads "Skipped" until it has actually been
    # computed, so later consumers can always rely on the *_status names.
    div_val, npmi_val, umass_val, sil_val = 0.0, 0.0, 0.0, 0.0
    div_status, npmi_status, umass_status, sil_status = "Skipped", "Skipped", "Skipped", "Skipped"
    u_words_len, t_words_len = 0, 0

    if eval_metrics:
        with st.spinner("Calculating mathematical metrics... (NPMI requires building a dictionary and takes a moment)"):

            # 1. Diversity: unique theme words divided by total theme words.
            if "Topic Diversity" in eval_metrics and all_words:
                u_words = {w for theme in all_words for w in theme}
                t_words = sum(len(theme) for theme in all_words)
                u_words_len, t_words_len = len(u_words), t_words
                div_val = float(len(u_words) / t_words) if t_words > 0 else 0.0
                div_status = f"{div_val:.2f}"

            # 2. Coherence Models (NPMI & UMass)
            want_npmi = "NPMI Coherence" in eval_metrics
            want_umass = "UMass Coherence" in eval_metrics
            if want_npmi or want_umass:
                # Shared preparation. Tokenise with the vectorizer's own
                # analyzer (built once) so tokens match the theme words.
                try:
                    analyzer = vectorizer_model.build_analyzer()
                    tokenized = [analyzer(s) for s in sentences]
                    dictionary = corpora.Dictionary(tokenized)
                except Exception:
                    # Best effort: coherence is optional, never fatal.
                    tokenized, dictionary = None, None

                # Each measure gets its own try block so that a failure in one
                # cannot overwrite the result of the other, and a measure that
                # was not requested keeps its plain "Skipped" status.
                if want_npmi:
                    npmi_status = "Skipped (Data too small)"
                    if dictionary is not None:
                        try:
                            cm_npmi = CoherenceModel(topics=all_words, texts=tokenized, dictionary=dictionary, coherence='c_npmi')
                            temp_npmi = cm_npmi.get_coherence()
                        except Exception:
                            npmi_status = "Skipped (Data too small)"
                        else:
                            if np.isnan(temp_npmi):
                                npmi_status = "N/A (Too few words)"
                            else:
                                npmi_val = float(temp_npmi)
                                npmi_status = f"{npmi_val:.2f}"

                if want_umass:
                    umass_status = "Skipped (Data too small)"
                    if dictionary is not None:
                        try:
                            cm_umass = CoherenceModel(topics=all_words, texts=tokenized, dictionary=dictionary, coherence='u_mass')
                            temp_umass = cm_umass.get_coherence()
                        except Exception:
                            umass_status = "Skipped (Data too small)"
                        else:
                            if np.isnan(temp_umass):
                                umass_status = "N/A (Too few words)"
                            else:
                                umass_val = float(temp_umass)
                                umass_status = f"{umass_val:.2f}"

            # 3. Silhouette Score over the documents that were assigned to a
            # real theme (label -1 excluded).
            if "Silhouette Score" in eval_metrics:
                valid_idx = [i for i, t in enumerate(topics) if t != -1]
                unique_topics = {topics[i] for i in valid_idx}
                # Computed only with at least 2 labels and fewer labels than
                # samples; otherwise the metric is reported as skipped.
                if 1 < len(unique_topics) < len(valid_idx):
                    sil_val = float(silhouette_score(
                        embeddings[valid_idx],
                        [topics[i] for i in valid_idx],
                        metric='cosine'
                    ))
                    sil_status = f"{sil_val:.2f}"
                else:
                    sil_status = "Skipped (Themes need ≥2 sentences)"

        # --- RENDER KPI DASHBOARD WITH TOOLTIPS ---
        # Metric label -> (displayed value, tooltip text).
        kpi_cards = {
            "Topic Diversity": (
                div_status,
                "Math: Unique Words / Total Words.\nTarget: 1.0 (No redundant words across themes).",
            ),
            "NPMI Coherence": (
                npmi_status,
                "Math: Normalized Pointwise Mutual Information.\nCalculates joint probability of words existing together.\nTarget: >0.1",
            ),
            "UMass Coherence": (
                umass_status,
                "Math: Internal log-conditional probability.\nEvaluates if words co-occur strictly inside your uploaded dataset.\nTarget: Closer to 0.",
            ),
            "Silhouette Score": (
                sil_status,
                "Math: (b - a) / max(a,b).\nMeasures intra-cluster density (a) vs nearest-cluster distance (b).\nTarget: >0.0",
            ),
        }

        st.markdown("### 📊 Key Performance Indicators (KPI)")
        kpi_cols = st.columns(len(eval_metrics))

        # One column per selected metric, in the order the user selected them.
        for column, metric in zip(kpi_cols, eval_metrics):
            if metric in kpi_cards:
                card_value, card_help = kpi_cards[metric]
                with column:
                    st.metric(label=metric, value=card_value, help=card_help)
# ==========================================
    # 5. XAI VISUALIZATION GRAPH (With Live Math & Matrices)
    # ==========================================
    st.markdown("### 📈 Explainable AI (XAI) Architecture Map")

    with st.spinner("Rendering Mathematical Dashboard..."):
        sns.set_theme(style="whitegrid")
        # All axes are created through the figure object rather than through
        # pyplot's implicit "current figure".
        fig = plt.figure(figsize=(18, 16))

        # First word of each UI label, e.g. "Mean" / "KeyBERTInspired".
        pool_title = ui_pooling.split()[0]
        rep_title = ui_extraction.split()[0]
        fig.suptitle(f"Topic Modeling Mathematical Pipeline\n(Pooling: {pool_title} | Rep: {rep_title})", fontsize=20, fontweight='bold', y=0.98)

        # Shared style for the annotation boxes drawn under each panel.
        box_style = dict(boxstyle="round,pad=0.5", facecolor='#f8f9fa', edgecolor='#4b72b8', alpha=0.95, lw=2)

        # --------------------------------------------------
        # 1. Embeddings & Pooling
        # --------------------------------------------------
        ax1 = fig.add_subplot(3, 2, 1)
        sns.heatmap(embeddings[:, :50], cmap="viridis", cbar=False, ax=ax1)
        ax1.set_title("STEP 1: Embeddings & Pooling", fontsize=13, fontweight='bold')
        ax1.set_ylabel("Sentences (Docs)")
        ax1.set_xlabel("Vector Dimensions (First 50 shown)")

        # Live values: matrix shape and the first five numbers of document 1.
        emb_shape = embeddings.shape
        emb_sample = np.round(embeddings[0, :5], 3).tolist()

        math_text_1 = (
            r"$\mathbf{Math (Mean Pool):} \quad v = \frac{1}{N} \sum_{i=1}^{N} \text{BERT}(w_i)$" + "\n"
            f"Matrix Shape: {emb_shape} (Docs x Dims)\n"
            f"Doc 1 [Dims 1-5]: {emb_sample}..."
        )
        ax1.text(0.5, -0.25, math_text_1, fontsize=11, ha='center', va='top', transform=ax1.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 2. Geometry (Dimensionality Reduction)
        # --------------------------------------------------
        ax2 = fig.add_subplot(3, 2, 2)
        reduced_embeddings = topic_model.umap_model.transform(embeddings)
        # Only the first two reduced components are plotted.
        ax2.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c='grey', s=100, alpha=0.6, edgecolor='k')
        ax2.set_title(f"STEP 2: Geometry ({algo_used})", fontsize=13, fontweight='bold')

        red_shape = reduced_embeddings.shape
        red_sample = np.round(reduced_embeddings[0, :2], 3).tolist()

        eq_2 = r"$\mathbf{Math (PCA):} \quad Z = X \cdot W_{2D}$" if algo_used == "PCA" else r"$\mathbf{Math (UMAP):} \quad \text{Topological Manifold Approx.}$"
        # Report the real number of reduced dimensions (2 for PCA, 5 for the
        # UMAP configuration) instead of always claiming "2D".
        math_text_2 = (
            f"{eq_2}\n"
            f"Matrix Shape: {red_shape} (Docs x {red_shape[1]}D Coordinates)\n"
            f"Doc 1 Coordinate: [X: {red_sample[0]}, Y: {red_sample[1]}]"
        )
        ax2.text(0.5, -0.25, math_text_2, fontsize=11, ha='center', va='top', transform=ax2.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 3. Clustering
        # --------------------------------------------------
        ax3 = fig.add_subplot(3, 2, 3)
        ax3.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=topics, cmap='tab10', s=150, edgecolor='k')
        ax3.set_title(f"STEP 3: Clustering ({cluster_algo})", fontsize=13, fontweight='bold')

        # Up to the first five document labels.
        topic_sample = topics[:5]

        eq_3 = r"$\mathbf{Math (K-Means):} \quad \arg\min_S \sum ||x_i - \mu_c||^2$" if cluster_algo == "K-Means" else r"$\mathbf{Math (HDBSCAN):} \quad \text{Density} = \frac{1}{\text{core\_dist}(x)}$"
        math_text_3 = (
            f"{eq_3}\n"
            f"Output Array Shape: ({len(topics)},) (1 Label per Doc)\n"
            f"First {len(topic_sample)} Doc Assignments: {topic_sample}"
        )
        ax3.text(0.5, -0.25, math_text_3, fontsize=11, ha='center', va='top', transform=ax3.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 4. Representation
        # --------------------------------------------------
        ax4 = fig.add_subplot(3, 2, 4)

        # First real topic (label -1 excluded); empty-string words are dropped
        # so padding slots do not appear as unlabeled bars.
        valid_topics = [t for t in topic_model.get_topics() if t != -1]
        theme_1_data = topic_model.get_topic(valid_topics[0]) if valid_topics else None
        if theme_1_data:
            theme_1_data = [(word, score) for word, score in theme_1_data if word]

        if theme_1_data:
            # Reversed so the first-listed word ends up at the top of the
            # horizontal bar chart.
            words = [word for word, _ in theme_1_data][::-1]
            scores = [score for _, score in theme_1_data][::-1]
            ax4.barh(words, scores, color='coral', edgecolor='black')
            ax4.set_title(f"STEP 4: Topic Representation ({rep_title})", fontsize=13, fontweight='bold')

            top_word_score = round(scores[-1], 4)
            vocab_len = len(vectorizer_model.vocabulary_) if hasattr(vectorizer_model, 'vocabulary_') else 'N/A'

            math_text_4 = (
                r"$\mathbf{Math (c-TF-IDF):} \quad W_{t,c} = tf_{t,c} \times \log\left(1 + \frac{A}{df_t}\right)$" + "\n"
                f"Global Vocab Extracted: {vocab_len} terms\n"
                f"Top Word ('{words[-1]}') Score: {top_word_score}"
            )
            ax4.text(0.5, -0.25, math_text_4, fontsize=11, ha='center', va='top', transform=ax4.transAxes, bbox=box_style)
        else:
            ax4.text(0.5, 0.5, "Theme not found", ha='center', transform=ax4.transAxes)

        # --------------------------------------------------
        # 5. KPI Dashboard
        # --------------------------------------------------
        ax5 = fig.add_subplot(3, 2, 5)
        ax5.axis('off')
        ax5.set_title("STEP 5: Post-Hoc Evaluation Formulas", fontsize=13, fontweight='bold', y=0.95)

        # One formula per metric, each followed by its live status string.
        kpi_math = "\n\n".join([
            r"$\mathbf{Diversity:} \quad D = \frac{| \text{Unique} |}{| \text{Total} |}$" + f"  [Live: {div_status}]",
            r"$\mathbf{Silhouette:} \quad S = \frac{b - a}{\max(a, b)}$" + f"  [Live: {sil_status}]",
            r"$\mathbf{NPMI:} \quad \frac{\log(P(x,y) / P(x)P(y))}{-\log P(x,y)}$" + f"  [Live: {npmi_status}]",
            r"$\mathbf{UMass:} \quad \log \frac{P(x,y) + \epsilon}{P(x)}$" + f"  [Live: {umass_status}]",
        ])

        ax5.text(0.5, 0.45, kpi_math, fontsize=12, va='center', ha='center',
                 bbox=dict(boxstyle="square,pad=1.2", facecolor='#e6f2ff', edgecolor='#377eb8', lw=2))

        ax5.text(0.5, -0.15, "Math: UMass measures internal dataset logic. NPMI measures external logic.\nSilhouette measures geometric separation.",
                 fontsize=10, ha='center', va='top', transform=ax5.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 6. Summary Matrix Transformations
        # --------------------------------------------------
        ax6 = fig.add_subplot(3, 2, 6)
        ax6.axis('off')
        summary_text = (
            "=== THE MATRIX TRANSFORMATION LIFECYCLE ===\n\n"
            f"1. Raw Text $\\rightarrow$ {emb_shape} Matrix (Dense Meaning)\n"
            f"2. {emb_shape} $\\rightarrow$ {red_shape} Matrix (Geometric Compression)\n"
            f"3. {red_shape} $\\rightarrow$ ({len(topics)},) Array (Discrete Bucketing)\n"
            f"4. ({len(topics)},) $\\rightarrow$ c-TF-IDF Matrix (Word Extraction)\n"
            f"5. c-TF-IDF $\\rightarrow$ {words_per_theme} Output Words (Per Theme)\n\n"
            "This proves Topic Modeling is a sequence of \ndimensionality reductions and matrix multiplications."
        )
        ax6.text(0.1, 0.5, summary_text, fontsize=12, va='center', ha='left',
                 bbox=dict(boxstyle="square,pad=1", facecolor='#f0f0f0', edgecolor='grey', lw=2))

        fig.subplots_adjust(hspace=0.7, wspace=0.3)
        st.pyplot(fig, use_container_width=True)
        # pyplot keeps every figure it created alive until it is closed;
        # release this one so repeated runs do not accumulate figures.
        plt.close(fig)