"""Streamlit front end for an interactive BERTopic topic-modeling demo."""

import os
import warnings

# Set before the heavy numeric / tokenizer imports below -- presumably so
# those libraries pick the values up at import time (keep this ordering).
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import streamlit as st  # noqa: E402
import pandas as pd  # noqa: E402
import numpy as np  # noqa: E402
import matplotlib.pyplot as plt  # noqa: E402
import seaborn as sns  # noqa: E402

from bertopic import BERTopic  # noqa: E402
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired  # noqa: E402
from sentence_transformers import SentenceTransformer, models  # noqa: E402
from sklearn.feature_extraction.text import CountVectorizer  # noqa: E402
from sklearn.decomposition import PCA  # noqa: E402
from sklearn.cluster import KMeans  # noqa: E402
from sklearn.metrics import silhouette_score  # noqa: E402
from umap import UMAP  # noqa: E402
from hdbscan import HDBSCAN  # noqa: E402
import gensim.corpora as corpora  # noqa: E402
from gensim.models.coherencemodel import CoherenceModel  # noqa: E402

# Silence every warning category for the whole process so library warnings
# are not surfaced while the app runs.
warnings.filterwarnings("ignore")
|
|
|
|
|
|
|
|
|
# Page-level settings, configured before any other element is rendered.
st.set_page_config(
    page_title="Topic Modeling Pipeline",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Global CSS overrides injected as raw HTML (hence unsafe_allow_html).
st.markdown(
    """
<style>
/* Force the vertical scrollbar to always show so the page width never changes */
html { overflow-y: scroll; }
/* Prevent horizontal scrolling */
.block-container { max-width: 100%; overflow-x: hidden; }
</style>
""",
    unsafe_allow_html=True,
)
|
|
|
# UI label -> model identifier handed to load_embedder().
EMBEDDING_MAP = {
    "MiniLM (Fast & Lightweight)": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet (High Accuracy)": "sentence-transformers/all-mpnet-base-v2",
    "Specter2 (Scientific/Academic)": "allenai/specter2_base",
}

# UI label -> pooling key. load_embedder() enables a pooling mode when its
# name ("mean", "max", "cls") occurs as a substring of the key, which is why
# "mean-max" switches on both mean and max pooling.
POOLING_MAP = {
    "Mean (Smooth context)": "mean",
    "Max (Sharp keywords)": "max",
    "CLS (Classification)": "cls",
    "Mean-Max (Combined)": "mean-max",
}
|
|
|
|
|
@st.cache_resource
def load_embedder(model_name: str, pool_strat: str):
    """Build a sentence embedder with a configurable pooling head.

    The result is cached by Streamlit's ``cache_resource`` decorator, keyed on
    the two arguments.

    Args:
        model_name: Model identifier passed to ``models.Transformer``.
        pool_strat: Pooling key such as ``"mean"``, ``"max"``, ``"cls"`` or
            ``"mean-max"``. Each pooling mode is enabled when its name occurs
            as a substring of this value.

    Returns:
        A ``SentenceTransformer`` chaining the transformer, the pooling layer
        and a ``Normalize`` module.
    """
    word_emb = models.Transformer(model_name)
    pool_model = models.Pooling(
        word_emb.get_word_embedding_dimension(),
        pooling_mode_mean_tokens="mean" in pool_strat,
        pooling_mode_max_tokens="max" in pool_strat,
        pooling_mode_cls_token="cls" in pool_strat,
    )
    return SentenceTransformer(modules=[word_emb, pool_model, models.Normalize()])
|
|
|
|
|
|
|
|
|
st.title("BERTopic - Topic Modeling Pipeline with Math Visualization")

# Optional banner image. Checking for the file up front (instead of catching
# FileNotFoundError) keeps the page alive on Streamlit versions that report a
# missing image with their own media-storage exception type.
_PIPELINE_IMAGE = "pipeline.png"
if os.path.isfile(_PIPELINE_IMAGE):
    st.image(_PIPELINE_IMAGE, use_container_width=True)

st.divider()

st.header("📥 Step 0: Input Data & Core Settings")
st.info("💡 **Preprocessing Note:** You do not need to manually lowercase or strip punctuation. The `CountVectorizer` algorithm and the `Uncased` BERT Neural Networks handle casing and token normalization autonomously at the mathematical level.")

data_source = st.radio("Choose Data Source:", ["Use Sample ACM Abstract", "Paste Text"], horizontal=True)

# Built-in demo text used to pre-fill the text area.
sample_abstract = """
Students who registered for the Mapping with Google massive open online course (MOOC)
were asked several questions during the registration process to identify prior
experience with eleven skills as well as their goals for registering for the course.
At the end of the course, we compared students' self reports of goal achievement
with behavioral click-stream analysis. In addition, we assessed how well prior
skill in a subject predicts a student's course completion and found no correlation.
Our research shows that students who completed course activities were more likely
to earn certificates of completion than peers who did not.
"""

# Pre-fill with the sample only when the sample source is selected.
raw_data = st.text_area(
    "Text Data:",
    value=sample_abstract if data_source == "Use Sample ACM Abstract" else "",
    height=150,
)

col_a, col_b = st.columns(2)
with col_a:
    n_themes = st.slider("Target Number of Themes", 2, 20, 3)
with col_b:
    words_per_theme = st.slider("Words to Output per Theme", 3, 10, 5)
|
|
|
|
|
st.header("⚙️ Model Configuration")

# --- 1. Embedding model and pooling strategy -------------------------------
with st.expander("1️⃣ Semantic Layer (Embeddings & Pooling)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `all-MiniLM-L6-v2` with `Mean` pooling.*")
    ui_embedding = st.selectbox("Embedding Model (Override Default):", list(EMBEDDING_MAP))
    ui_pooling = st.selectbox("Pooling Strategy (Override Default):", list(POOLING_MAP))

# --- 2. Dimensionality reduction -------------------------------------------
with st.expander("2️⃣ Geometry Layer (Dimensionality Reduction)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `UMAP` with `Cosine` distance to reduce 384D to 5D space.*")
    ui_algo = st.selectbox("Algorithm", ["UMAP (Complex geometry)", "PCA (Fast/Deterministic)"])
    if "UMAP" in ui_algo:
        ui_metric = st.selectbox("Distance Metric", ["cosine", "euclidean", "manhattan"])
    else:
        # The metric is only consumed by UMAP; a fixed value keeps the name defined.
        ui_metric = "euclidean"
        st.info("PCA inherently uses Variance (Euclidean math), so distance metrics are bypassed.")

# --- 3. Clustering (informational only, no widgets) ------------------------
with st.expander("3️⃣ Clustering Layer (Grouping)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `HDBSCAN` exclusively (which crashes on tiny datasets).*")
    # "\\ge" is escaped explicitly: a bare "\g" is an invalid escape sequence.
    st.markdown(
        "*The model mathematically draws boundaries around similar sentences.*\n\n"
        "* **Primary clustering algorithm (HDBSCAN):** Runs on datasets $\\ge$ 15 sentences. Automatically filters outliers and finds dense semantic clouds. *(Defaults: min_cluster_size=10)*\n"
        "* **Fallback clustering algorithm (K-Means):** Runs on datasets $<$ 15 sentences. Forces all sentences into buckets to prevent math crashes on tiny text samples.\n"
    )

# --- 4. Vectorizer settings -------------------------------------------------
with st.expander("4️⃣ Vocabulary Layer (Vectorization)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `Unigrams` (1 word) and does **not** filter redundant dataset noise.*")
    # A tuple default makes this a range slider, so the value is a (min_n, max_n) pair.
    ngram_range = st.slider("N-Gram Range", 1, 3, (1, 2), help="1=Unigrams, 2=Bigrams (e.g., 'machine learning')")
    auto_noise = st.checkbox("Auto-Remove Redundant Noise (max_df)", value=True, help="Mathematically deletes words appearing in >85% of documents.")

# --- 5. Topic representation -----------------------------------------------
with st.expander("5️⃣ Extraction Layer (Representation)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** ALWAYS extracts baseline words using **c-TF-IDF** (Word Frequency).*")
    ui_extraction = st.selectbox(
        "Apply Advanced Filter on top of c-TF-IDF:",
        ["None (Base c-TF-IDF only)", "KeyBERTInspired (Semantic cosine)", "MMR (Reduce redundancy)"],
    )
    # The diversity slider is only shown (and only meaningful) for MMR.
    if "MMR" in ui_extraction:
        mmr_diversity = st.slider("MMR Diversity Penalty", 0.0, 1.0, 0.3)
    else:
        mmr_diversity = None

st.header("📊 Evaluation Metrics")
eval_metrics = st.multiselect(
    "Select KPIs to generate a final report card:",
    ["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"],
    default=["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"],
)

st.divider()
|
|
|
|
|
|
|
|
|
def _coherence_status(topic_words, tokenized_docs, dictionary, measure):
    """Compute one gensim coherence measure and format it for display.

    Args:
        topic_words: List of word lists, one per theme.
        tokenized_docs: Tokenized sentences used as the reference corpus.
        dictionary: ``gensim`` dictionary built from ``tokenized_docs``.
        measure: Coherence name understood by ``CoherenceModel``
            (``"c_npmi"`` or ``"u_mass"`` here).

    Returns:
        The score formatted to two decimals, or an explanatory placeholder
        when the score is NaN or the computation fails.
    """
    try:
        score = CoherenceModel(
            topics=topic_words,
            texts=tokenized_docs,
            dictionary=dictionary,
            coherence=measure,
        ).get_coherence()
    except Exception:
        # Deliberate best effort: a coherence failure must not take down the
        # whole report, and must not affect the other metric.
        return "Skipped (Data too small)"
    if np.isnan(score):
        return "N/A (Too few words)"
    return f"{float(score):.2f}"


if st.button("🚀 Run Topic Modeling Pipeline", type="primary", use_container_width=True):
    from sklearn.feature_extraction import text

    if not raw_data or len(raw_data) < 20:
        st.error("Please provide more text data!")
        st.stop()

    # ------------------------------------------------------------------
    # Model construction and fitting
    # ------------------------------------------------------------------
    with st.spinner("Processing Semantic Pipeline... (Models are cached to prevent crashes)"):
        # Naive sentence split on periods; fragments of 10 characters or
        # fewer are discarded.
        sentences = [s.strip() for s in raw_data.split('.') if len(s.strip()) > 10]
        dataset_size = len(sentences)

        # A 2-component PCA cannot be fitted on fewer than two documents.
        if dataset_size < 2:
            st.error("Please provide at least two sentences (each longer than 10 characters)!")
            st.stop()

        academic_noise = ['students', 'course', 'research', 'paper', 'found', 'likely', 'did']
        stop_w = list(text.ENGLISH_STOP_WORDS.union(academic_noise))

        vectorizer_model = CountVectorizer(
            stop_words=stop_w,
            ngram_range=ngram_range,
            # max_df pruning is only enabled above 10 sentences.
            max_df=0.85 if auto_noise and dataset_size > 10 else 1.0,
        )

        custom_embedder = load_embedder(EMBEDDING_MAP[ui_embedding], POOLING_MAP[ui_pooling])
        embeddings = custom_embedder.encode(sentences)

        # PCA + K-Means is used for small inputs or when PCA is selected;
        # otherwise UMAP + HDBSCAN.
        is_fallback = dataset_size < 15 or "PCA" in ui_algo
        if is_fallback:
            # K-Means cannot form more clusters than there are sentences.
            safe_n_themes = min(n_themes, dataset_size)
            dim_model = PCA(n_components=2, random_state=42)
            cluster_model = KMeans(n_clusters=safe_n_themes, random_state=42)
            reduce_topics = None
            algo_used = "PCA"
            cluster_algo = "K-Means"
        else:
            safe_n_themes = n_themes
            dim_model = UMAP(n_neighbors=15, n_components=5, metric=ui_metric, random_state=42)
            # Must share the name used by the K-Means branch: BERTopic below
            # reads `cluster_model` in both cases.
            cluster_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
            reduce_topics = n_themes
            algo_used = "UMAP"
            cluster_algo = "HDBSCAN"

        if "MMR" in ui_extraction:
            rep_model = MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=words_per_theme)
        elif "KeyBERT" in ui_extraction:
            rep_model = KeyBERTInspired(top_n_words=words_per_theme)
        else:
            rep_model = None

        topic_model = BERTopic(
            embedding_model=custom_embedder,
            umap_model=dim_model,
            hdbscan_model=cluster_model,
            vectorizer_model=vectorizer_model,
            representation_model=rep_model,
            nr_topics=reduce_topics,
            top_n_words=words_per_theme,
            language="english",
        )
        # Reuse the embeddings computed above instead of encoding twice.
        topics, _ = topic_model.fit_transform(sentences, embeddings=embeddings)

    st.success("Analysis Complete!")

    if is_fallback:
        if safe_n_themes < n_themes:
            st.warning(f"⚠️ **Reduced requested themes from {n_themes} to {safe_n_themes}.**\n\n"
                       f"*Reason:* BERTopic clusters complete sentences to preserve context. "
                       f"You cannot sort {dataset_size} sentences into {n_themes} buckets without leaving empty buckets, "
                       f"which mathematically breaks the clustering algorithms!")
        elif dataset_size < 15:
            st.info(f"ℹ️ Auto-switched to PCA/K-Means due to small dataset size ({dataset_size} sentences).")
        else:
            # The user picked PCA explicitly; do not blame the dataset size.
            st.info("ℹ️ Using PCA/K-Means as selected in the Geometry Layer.")

    # ------------------------------------------------------------------
    # Theme cards
    # ------------------------------------------------------------------
    st.markdown("### 🏆 Discovered Themes")
    topic_info = topic_model.get_topic_info()
    all_words = []

    cols = st.columns(3)
    # Topic -1 is skipped (outlier bucket in BERTopic).
    theme_ids = [t_id for t_id in topic_info['Topic'] if t_id != -1]
    for position, t_id in enumerate(theme_ids):
        # Drop empty placeholder words so they do not distort the cards or
        # the metrics computed from `all_words`.
        theme_w = [w[0] for w in topic_model.get_topic(t_id) if w[0]]
        all_words.append(theme_w)
        with cols[position % 3]:
            st.info(f"**Theme {t_id + 1}**\n\n" + "\n".join([f"🔹 {w}" for w in theme_w]))

    # ------------------------------------------------------------------
    # Evaluation metrics
    # ------------------------------------------------------------------
    div_status = npmi_status = umass_status = sil_status = "Skipped"

    if eval_metrics:
        with st.spinner("Calculating mathematical metrics... (NPMI requires building a dictionary and takes a moment)"):

            if "Topic Diversity" in eval_metrics and all_words:
                unique_count = len({w for theme in all_words for w in theme})
                total_count = sum(len(theme) for theme in all_words)
                div_val = unique_count / total_count if total_count > 0 else 0.0
                div_status = f"{div_val:.2f}"

            wants_npmi = "NPMI Coherence" in eval_metrics
            wants_umass = "UMass Coherence" in eval_metrics
            if wants_npmi or wants_umass:
                coherence_topics = [theme for theme in all_words if theme]
                try:
                    # Tokenize with the same analyzer that produced the topic
                    # vocabulary so n-grams line up with the topic words.
                    analyzer = vectorizer_model.build_analyzer()
                    tokenized = [analyzer(s) for s in sentences]
                    dictionary = corpora.Dictionary(tokenized)
                except Exception:
                    npmi_status = "Skipped (Data too small)"
                    umass_status = "Skipped (Data too small)"
                else:
                    # Each measure is evaluated independently so one failure
                    # cannot overwrite the other's result.
                    if wants_npmi:
                        npmi_status = _coherence_status(coherence_topics, tokenized, dictionary, 'c_npmi')
                    if wants_umass:
                        umass_status = _coherence_status(coherence_topics, tokenized, dictionary, 'u_mass')

            if "Silhouette Score" in eval_metrics:
                valid_idx = [i for i, t in enumerate(topics) if t != -1]
                valid_labels = [topics[i] for i in valid_idx]
                # silhouette_score needs 2 <= n_labels <= n_samples - 1.
                if 1 < len(set(valid_labels)) < len(valid_idx):
                    sil_val = float(silhouette_score(
                        np.array([embeddings[i] for i in valid_idx]),
                        valid_labels,
                        metric='cosine',
                    ))
                    sil_status = f"{sil_val:.2f}"
                else:
                    sil_status = "Skipped (Themes need ≥2 sentences)"

        st.markdown("### 📊 Key Performance Indicators (KPI)")
        kpi_cards = {
            "Topic Diversity": (
                div_status,
                "Math: Unique Words / Total Words.\nTarget: 1.0 (No redundant words across themes).",
            ),
            "NPMI Coherence": (
                npmi_status,
                "Math: Normalized Pointwise Mutual Information.\nCalculates joint probability of words existing together.\nTarget: >0.1",
            ),
            "UMass Coherence": (
                umass_status,
                "Math: Internal log-conditional probability.\nEvaluates if words co-occur strictly inside your uploaded dataset.\nTarget: Closer to 0.",
            ),
            "Silhouette Score": (
                sil_status,
                "Math: (b - a) / max(a,b).\nMeasures intra-cluster density (a) vs nearest-cluster distance (b).\nTarget: >0.0",
            ),
        }
        # One column per selected metric, in selection order.
        for kpi_col, metric in zip(st.columns(len(eval_metrics)), eval_metrics):
            if metric in kpi_cards:
                kpi_value, kpi_help = kpi_cards[metric]
                with kpi_col:
                    st.metric(label=metric, value=kpi_value, help=kpi_help)

    # ------------------------------------------------------------------
    # Six-panel explanatory dashboard
    # ------------------------------------------------------------------
    st.markdown("### 📈 Explainable AI (XAI) Architecture Map")

    with st.spinner("Rendering Mathematical Dashboard..."):
        sns.set_theme(style="whitegrid")
        fig = plt.figure(figsize=(18, 16))
        try:
            pool_title = ui_pooling.split()[0]
            rep_title = ui_extraction.split()[0]
            fig.suptitle(f"Topic Modeling Mathematical Pipeline\n(Pooling: {pool_title} | Rep: {rep_title})", fontsize=20, fontweight='bold', y=0.98)

            # Shared style for the caption box under each panel.
            box_style = dict(boxstyle="round,pad=0.5", facecolor='#f8f9fa', edgecolor='#4b72b8', alpha=0.95, lw=2)

            # Panel 1: raw embedding heatmap.
            ax1 = fig.add_subplot(3, 2, 1)
            sns.heatmap(embeddings[:, :50], cmap="viridis", cbar=False, ax=ax1)
            ax1.set_title("STEP 1: Embeddings & Pooling", fontsize=13, fontweight='bold')
            ax1.set_ylabel("Sentences (Docs)")
            ax1.set_xlabel("Vector Dimensions (First 50 shown)")

            emb_shape = embeddings.shape
            emb_sample = np.round(embeddings[0, :5], 3).tolist()

            math_text_1 = (
                r"$\mathbf{Math (Mean Pool):} \quad v = \frac{1}{N} \sum_{i=1}^{N} \text{BERT}(w_i)$" + "\n"
                f"Matrix Shape: {emb_shape} (Docs x Dims)\n"
                f"Doc 1 [Dims 1-5]: {emb_sample}..."
            )
            ax1.text(0.5, -0.25, math_text_1, fontsize=11, ha='center', va='top', transform=ax1.transAxes, bbox=box_style)

            # Panel 2: reduced embeddings (first two components plotted).
            ax2 = fig.add_subplot(3, 2, 2)
            reduced_embeddings = topic_model.umap_model.transform(embeddings)
            ax2.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c='grey', s=100, alpha=0.6, edgecolor='k')
            ax2.set_title(f"STEP 2: Geometry ({algo_used})", fontsize=13, fontweight='bold')

            red_shape = reduced_embeddings.shape
            red_sample = np.round(reduced_embeddings[0, :2], 3).tolist()

            eq_2 = r"$\mathbf{Math (PCA):} \quad Z = X \cdot W_{2D}$" if algo_used == "PCA" else r"$\mathbf{Math (UMAP):} \quad \text{Topological Manifold Approx.}$"
            math_text_2 = (
                f"{eq_2}\n"
                f"Matrix Shape: {red_shape} (Docs x 2D Coordinates)\n"
                f"Doc 1 Coordinate: [X: {red_sample[0]}, Y: {red_sample[1]}]"
            )
            ax2.text(0.5, -0.25, math_text_2, fontsize=11, ha='center', va='top', transform=ax2.transAxes, bbox=box_style)

            # Panel 3: same scatter, colored by assigned topic.
            ax3 = fig.add_subplot(3, 2, 3)
            ax3.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=topics, cmap='tab10', s=150, edgecolor='k')
            ax3.set_title(f"STEP 3: Clustering ({cluster_algo})", fontsize=13, fontweight='bold')

            topic_sample = topics[:5]

            eq_3 = r"$\mathbf{Math (K-Means):} \quad \arg\min_S \sum ||x_i - \mu_c||^2$" if cluster_algo == "K-Means" else r"$\mathbf{Math (HDBSCAN):} \quad \text{Density} = \frac{1}{\text{core\_dist}(x)}$"
            math_text_3 = (
                f"{eq_3}\n"
                f"Output Array Shape: ({len(topics)},) (1 Label per Doc)\n"
                f"First {len(topic_sample)} Doc Assignments: {topic_sample}"
            )
            ax3.text(0.5, -0.25, math_text_3, fontsize=11, ha='center', va='top', transform=ax3.transAxes, bbox=box_style)

            # Panel 4: word scores of the first non-outlier theme.
            ax4 = fig.add_subplot(3, 2, 4)

            valid_topics = [t for t in topic_model.get_topics() if t != -1]
            theme_1_data = None
            if valid_topics:
                # Skip empty placeholder words so the bar chart has real labels.
                theme_1_data = [pair for pair in topic_model.get_topic(valid_topics[0]) if pair[0]]

            if theme_1_data:
                # Reversed so the top-scoring word ends up as the topmost bar.
                words = [x[0] for x in theme_1_data][::-1]
                scores = [x[1] for x in theme_1_data][::-1]
                ax4.barh(words, scores, color='coral', edgecolor='black')
                ax4.set_title(f"STEP 4: Topic Representation ({rep_title})", fontsize=13, fontweight='bold')

                top_word_score = round(scores[-1], 4)
                vocab_len = len(vectorizer_model.vocabulary_) if hasattr(vectorizer_model, 'vocabulary_') else 'N/A'

                math_text_4 = (
                    r"$\mathbf{Math (c-TF-IDF):} \quad W_{t,c} = tf_{t,c} \times \log\left(1 + \frac{A}{df_t}\right)$" + "\n"
                    f"Global Vocab Extracted: {vocab_len} terms\n"
                    f"Top Word ('{words[-1]}') Score: {top_word_score}"
                )
                ax4.text(0.5, -0.25, math_text_4, fontsize=11, ha='center', va='top', transform=ax4.transAxes, bbox=box_style)
            else:
                ax4.text(0.5, 0.5, "Theme not found", ha='center', transform=ax4.transAxes)

            # Panel 5: metric formulas with the live values computed above.
            ax5 = fig.add_subplot(3, 2, 5)
            ax5.axis('off')
            ax5.set_title("STEP 5: Post-Hoc Evaluation Formulas", fontsize=13, fontweight='bold', y=0.95)

            kpi_math = (
                r"$\mathbf{Diversity:} \quad D = \frac{| \text{Unique} |}{| \text{Total} |}$" + f" [Live: {div_status}]\n\n"
                r"$\mathbf{Silhouette:} \quad S = \frac{b - a}{\max(a, b)}$" + f" [Live: {sil_status}]\n\n"
                r"$\mathbf{NPMI:} \quad \frac{\log(P(x,y) / P(x)P(y))}{-\log P(x,y)}$" + f" [Live: {npmi_status}]\n\n"
                r"$\mathbf{UMass:} \quad \log \frac{P(x,y) + \epsilon}{P(x)}$" + f" [Live: {umass_status}]"
            )

            ax5.text(0.5, 0.45, kpi_math, fontsize=12, va='center', ha='center',
                     bbox=dict(boxstyle="square,pad=1.2", facecolor='#e6f2ff', edgecolor='#377eb8', lw=2))

            ax5.text(0.5, -0.15, "Math: UMass measures internal dataset logic. NPMI measures external logic.\nSilhouette measures geometric separation.",
                     fontsize=10, ha='center', va='top', transform=ax5.transAxes, bbox=box_style)

            # Panel 6: textual summary of the shape changes between stages.
            ax6 = fig.add_subplot(3, 2, 6)
            ax6.axis('off')
            summary_text = (
                "=== THE MATRIX TRANSFORMATION LIFECYCLE ===\n\n"
                f"1. Raw Text $\\rightarrow$ {emb_shape} Matrix (Dense Meaning)\n"
                f"2. {emb_shape} $\\rightarrow$ {red_shape} Matrix (Geometric Compression)\n"
                f"3. {red_shape} $\\rightarrow$ ({len(topics)},) Array (Discrete Bucketing)\n"
                f"4. ({len(topics)},) $\\rightarrow$ c-TF-IDF Matrix (Word Extraction)\n"
                f"5. c-TF-IDF $\\rightarrow$ {words_per_theme} Output Words (Per Theme)\n\n"
                "This proves Topic Modeling is a sequence of \ndimensionality reductions and matrix multiplications."
            )
            ax6.text(0.1, 0.5, summary_text, fontsize=12, va='center', ha='left',
                     bbox=dict(boxstyle="square,pad=1", facecolor='#f0f0f0', edgecolor='grey', lw=2))

            fig.subplots_adjust(hspace=0.7, wspace=0.3)
            st.pyplot(fig, use_container_width=True)
        finally:
            # pyplot keeps a reference to every figure it creates; close it
            # so figures do not accumulate across Streamlit reruns.
            plt.close(fig)
|
|
|