Enayut commited on
Commit
ed51280
·
verified ·
1 Parent(s): f7f4f4c

Upload 7 files

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +41 -0
  3. README.md +10 -0
  4. agent.py +287 -0
  5. app.py +1191 -0
  6. dataset.csv +3 -0
  7. requirements.txt +16 -0
  8. tools.py +626 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ dataset.csv filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies:
#  - build-essential: needed to compile some Python packages from source
#  - curl: required by the HEALTHCHECK below (NOT preinstalled in -slim
#    images; without it the health check always fails)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (Docker layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data at build time so it's baked into the image
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords')"

# Copy application code
COPY app.py .
COPY agent.py .
COPY tools.py .

# Copy dataset (bundled as the default dataset)
COPY dataset.csv .

# Create writable outputs directory (world-writable so a non-root runtime
# user can still write pipeline artefacts)
RUN mkdir -p /app/outputs && chmod 777 /app/outputs

# Expose Streamlit port
EXPOSE 8501

# Health check against Streamlit's built-in health endpoint
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1

# Run Streamlit
CMD ["streamlit", "run", "app.py", \
     "--server.port=8501", \
     "--server.address=0.0.0.0", \
     "--server.headless=true", \
     "--browser.gatherUsageStats=false", \
     "--server.fileWatcherType=none"]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Research Topic Modeler
3
+ emoji: 🔬
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 8501
8
+ pinned: false
9
+ license: mit
10
+ ---
agent.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ agent.py — TopicAgent orchestrates the end-to-end topic modeling workflow.
3
+
4
+ This module defines the TopicAgent class, which:
5
+ 1. Loads and validates the CSV dataset.
6
+ 2. Preprocesses text for Titles and Abstracts separately.
7
+ 3. Runs topic modeling on each corpus (≥100 topics guaranteed).
8
+ 4. Generates human-readable labels for every topic.
9
+ 5. Compares dominant themes across Title and Abstract topics.
10
+ 6. Produces a taxonomy map (MAPPED / NOVEL classification).
11
+ 7. Exports structured outputs: topics table, comparison CSV, taxonomy JSON.
12
+
13
+ Usage:
14
+ agent = TopicAgent(csv_path="dataset.csv")
15
+ results = agent.run()
16
+ """
17
+
18
+ import os
19
+ import json
20
+ import logging
21
+ from dataclasses import dataclass, field
22
+ from typing import Dict, Any, Optional
23
+
24
+ import pandas as pd
25
+
26
+ from tools import (
27
+ load_csv,
28
+ preprocess_text,
29
+ run_topic_modeling,
30
+ generate_labels,
31
+ compare_themes,
32
+ create_taxonomy_map,
33
+ )
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Logging
37
+ # ---------------------------------------------------------------------------
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Structured result container
43
+ # ---------------------------------------------------------------------------
44
@dataclass
class AgentResult:
    """Container for all outputs produced by the TopicAgent.

    Every field defaults to an empty container, so a freshly constructed
    (or partially completed / failed) result is always safe to read.
    """
    # Core dataframes:
    #   title_topics / abstract_topics — one table of discovered topics per
    #   corpus; combined_topics — both tables concatenated with an added
    #   "global_id" column; comparison — title-vs-abstract theme comparison.
    title_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    abstract_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    combined_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    comparison: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Taxonomy map (dict serialisable to JSON). Contains a "metadata" entry
    # with "mapped_count" / "novel_count" keys (read by _step_taxonomy_map's
    # logging and by the frontend).
    taxonomy_map: Dict[str, Any] = field(default_factory=dict)

    # Execution metadata:
    #   status          — "pending" until run() finishes, then "success" or "failed"
    #   steps_completed — names of pipeline steps that finished, in order
    #   errors          — stringified exceptions captured by run()
    status: str = "pending"
    steps_completed: list = field(default_factory=list)
    errors: list = field(default_factory=list)

    # File paths of exported artefacts, keyed by artefact name
    # (e.g. "topics_table" -> "<output_dir>/topics_table.csv").
    exported_files: Dict[str, str] = field(default_factory=dict)
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # TopicAgent
67
+ # ---------------------------------------------------------------------------
68
class TopicAgent:
    """
    Orchestrates the research-paper topic modeling pipeline.

    The pipeline runs nine sequential steps (see :meth:`run`): ingest the
    CSV, preprocess Titles and Abstracts, run topic modeling on each corpus,
    generate labels, combine and compare the topic tables, build a taxonomy
    map, and export all artefacts into ``output_dir``.

    The individual ``_step_*`` methods are also callable one at a time (the
    Streamlit frontend drives them directly to report progress), so their
    names and signatures must stay stable.

    Parameters
    ----------
    csv_path : str
        Path to the input CSV file.
    output_dir : str
        Directory to write output files (created if missing).
    min_topics : int
        Minimum number of topics to generate per source (default 100).
    use_llm_labels : bool
        Whether to use Groq LLM for label generation.
    groq_api_key : str, optional
        API key for Groq (used only when use_llm_labels is True).
    """

    def __init__(
        self,
        csv_path: str,
        output_dir: str = "outputs",
        min_topics: int = 100,
        use_llm_labels: bool = False,
        groq_api_key: Optional[str] = None,
    ):
        self.csv_path = csv_path
        self.output_dir = output_dir
        self.min_topics = min_topics
        self.use_llm_labels = use_llm_labels
        self.groq_api_key = groq_api_key

        # Ensure output directory exists before any step tries to write to it.
        os.makedirs(self.output_dir, exist_ok=True)

        self._result = AgentResult()

    # -----------------------------------------------------------------
    # Public interface
    # -----------------------------------------------------------------
    def run(self) -> AgentResult:
        """
        Execute the full pipeline step by step.

        Any exception aborts the remaining steps, marks the result as
        "failed" and records the error message; the partial result is still
        returned so callers can inspect which steps completed.

        Returns
        -------
        AgentResult
            Structured results including all DataFrames, taxonomy, and file paths.
        """
        logger.info("=" * 60)
        logger.info("TopicAgent — Starting pipeline")
        logger.info("=" * 60)

        try:
            self._step_load_csv()
            self._step_preprocess()
            self._step_model_titles()
            self._step_model_abstracts()
            self._step_generate_labels()
            self._step_combine_topics()
            self._step_compare_themes()
            self._step_taxonomy_map()
            self._step_export()

            self._result.status = "success"
            logger.info("Pipeline completed successfully.")

        except Exception as exc:
            self._result.status = "failed"
            self._result.errors.append(str(exc))
            logger.error("Pipeline failed: %s", exc, exc_info=True)

        return self._result

    # -----------------------------------------------------------------
    # Pipeline steps
    # -----------------------------------------------------------------
    def _step_load_csv(self):
        """Step 1 — Ingest CSV dataset into ``self._df``."""
        logger.info("Step 1/9: Loading CSV …")
        self._df = load_csv(self.csv_path)
        self._result.steps_completed.append("load_csv")
        logger.info("  → %d papers loaded.", len(self._df))

    def _step_preprocess(self):
        """Step 2 — Preprocess Title and Abstract text into two clean corpora."""
        logger.info("Step 2/9: Preprocessing text …")
        self._titles_clean = preprocess_text(self._df["Title"].tolist())
        self._abstracts_clean = preprocess_text(self._df["Abstract"].tolist())
        self._result.steps_completed.append("preprocess_text")
        logger.info("  → Titles preprocessed: %d docs", len(self._titles_clean))
        logger.info("  → Abstracts preprocessed: %d docs", len(self._abstracts_clean))

    def _step_model_titles(self):
        """Step 3 — Topic modeling on Titles."""
        logger.info("Step 3/9: Topic modeling on Titles …")
        self._title_topics_df, self._title_model = run_topic_modeling(
            self._titles_clean,
            source_label="Titles",
            min_topics=self.min_topics,
        )
        self._result.steps_completed.append("topic_modeling_titles")
        logger.info("  → %d title topics discovered.", len(self._title_topics_df))

    def _step_model_abstracts(self):
        """Step 4 — Topic modeling on Abstracts."""
        logger.info("Step 4/9: Topic modeling on Abstracts …")
        self._abstract_topics_df, self._abstract_model = run_topic_modeling(
            self._abstracts_clean,
            source_label="Abstracts",
            min_topics=self.min_topics,
        )
        self._result.steps_completed.append("topic_modeling_abstracts")
        logger.info("  → %d abstract topics discovered.", len(self._abstract_topics_df))

    def _step_generate_labels(self):
        """Step 5 — Generate human-readable labels for both topic tables."""
        logger.info("Step 5/9: Generating topic labels …")
        self._title_topics_df = generate_labels(
            self._title_topics_df,
            use_llm=self.use_llm_labels,
            groq_api_key=self.groq_api_key,
        )
        self._abstract_topics_df = generate_labels(
            self._abstract_topics_df,
            use_llm=self.use_llm_labels,
            groq_api_key=self.groq_api_key,
        )
        self._result.title_topics = self._title_topics_df
        self._result.abstract_topics = self._abstract_topics_df
        self._result.steps_completed.append("generate_labels")
        logger.info("  → Labels generated for all topics.")

    def _step_combine_topics(self):
        """Step 6 — Combine title and abstract topics into one table.

        Adds a ``global_id`` column so every topic has a unique identifier
        across both sources.
        """
        logger.info("Step 6/9: Building combined topics table …")
        combined = pd.concat(
            [self._title_topics_df, self._abstract_topics_df],
            ignore_index=True,
        )
        combined["global_id"] = range(len(combined))
        self._result.combined_topics = combined
        self._result.steps_completed.append("combine_topics")
        logger.info("  → Combined table: %d topics total.", len(combined))

    def _step_compare_themes(self):
        """Step 7 — Compare title vs abstract themes."""
        logger.info("Step 7/9: Comparing title vs abstract themes …")
        comparison = compare_themes(self._title_topics_df, self._abstract_topics_df)
        self._result.comparison = comparison
        self._result.steps_completed.append("compare_themes")
        logger.info("  → Comparison table: %d rows.", len(comparison))

    def _step_taxonomy_map(self):
        """Step 8 — Create taxonomy map (MAPPED / NOVEL classification)."""
        logger.info("Step 8/9: Building taxonomy map …")
        # Taxonomy is built from the combined table so it covers both sources.
        taxonomy = create_taxonomy_map(self._result.combined_topics)
        self._result.taxonomy_map = taxonomy
        self._result.steps_completed.append("create_taxonomy_map")
        logger.info(
            "  → MAPPED: %d, NOVEL: %d",
            taxonomy["metadata"]["mapped_count"],
            taxonomy["metadata"]["novel_count"],
        )

    def _step_export(self):
        """Step 9 — Export all outputs to disk (four CSVs plus taxonomy JSON)."""
        logger.info("Step 9/9: Exporting outputs …")

        # CSV artefacts: registry key doubles as the file stem.
        csv_exports = [
            ("topics_table", self._result.combined_topics),
            ("comparison", self._result.comparison),
            ("title_topics", self._result.title_topics),
            ("abstract_topics", self._result.abstract_topics),
        ]
        for key, frame in csv_exports:
            self._export_csv(key, frame)

        # Taxonomy map JSON
        taxonomy_path = os.path.join(self.output_dir, "taxonomy_map.json")
        with open(taxonomy_path, "w", encoding="utf-8") as f:
            json.dump(self._result.taxonomy_map, f, indent=2, ensure_ascii=False)
        self._result.exported_files["taxonomy_map"] = taxonomy_path
        logger.info("  → Saved: %s", taxonomy_path)

        self._result.steps_completed.append("export")
        logger.info("  → All outputs exported successfully.")

    def _export_csv(self, key: str, frame: pd.DataFrame) -> None:
        """Write *frame* to ``<output_dir>/<key>.csv`` and register the path."""
        path = os.path.join(self.output_dir, f"{key}.csv")
        frame.to_csv(path, index=False)
        self._result.exported_files[key] = path
        logger.info("  → Saved: %s", path)
app.py ADDED
@@ -0,0 +1,1191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — Streamlit frontend for the AI-driven Topic Modeling application.
3
+
4
+ This module provides an interactive web interface that allows users to:
5
+ 1. Upload a CSV file containing research paper Titles and Abstracts.
6
+ 2. Configure pipeline parameters (min topics, LLM label generation).
7
+ 3. Run the TopicAgent pipeline with a single click.
8
+ 4. View and explore results: topics table, comparison, taxonomy map.
9
+ 5. Review topics with an editable review table.
10
+ 6. Visualize topic distributions with interactive Plotly charts.
11
+ 7. Download all generated outputs (CSV, JSON).
12
+ """
13
+
14
+ import os
15
+ import json
16
+ import tempfile
17
+
18
+ import streamlit as st
19
+ import pandas as pd
20
+ import plotly.express as px
21
+ import plotly.graph_objects as go
22
+
23
+ from agent import TopicAgent
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # HuggingFace Spaces compatibility: use a writable output directory
27
+ # On HF Spaces the working directory can be read-only, so fall back to /tmp
28
+ # ---------------------------------------------------------------------------
29
OUTPUT_DIR = "outputs"
try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Probe write access: successfully creating and deleting a marker file
    # proves the directory is usable for pipeline artefacts.
    _probe_path = os.path.join(OUTPUT_DIR, ".write_test")
    with open(_probe_path, "w") as _probe:
        _probe.write("ok")
    os.remove(_probe_path)
except OSError:
    # PermissionError is a subclass of OSError, so one clause covers both
    # "cannot create" and "cannot write" — fall back to the system temp dir.
    OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "topic_modeler_outputs")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Page configuration
43
+ # ---------------------------------------------------------------------------
44
+ st.set_page_config(
45
+ page_title="Research Topic Modeler — AI Agent",
46
+ page_icon="🔬",
47
+ layout="wide",
48
+ initial_sidebar_state="expanded",
49
+ )
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Custom CSS for a polished, professional look with dark-safe text colors
53
+ # ---------------------------------------------------------------------------
54
+ st.markdown("""
55
+ <style>
56
+ /* Import Google Font */
57
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
58
+
59
+ /* Global */
60
+ html, body, [class*="css"] {
61
+ font-family: 'Inter', sans-serif;
62
+ }
63
+
64
+ /* Header gradient banner */
65
+ .main-header {
66
+ background: linear-gradient(135deg, #0f0c29 0%, #302b63 50%, #24243e 100%);
67
+ padding: 2rem 2.5rem;
68
+ border-radius: 16px;
69
+ margin-bottom: 1.5rem;
70
+ box-shadow: 0 8px 32px rgba(48, 43, 99, 0.3);
71
+ }
72
+ .main-header h1 {
73
+ color: #ffffff;
74
+ font-size: 2.2rem;
75
+ font-weight: 700;
76
+ margin: 0;
77
+ letter-spacing: -0.5px;
78
+ }
79
+ .main-header p {
80
+ color: #b8b5ff;
81
+ font-size: 1.05rem;
82
+ margin: 0.5rem 0 0 0;
83
+ font-weight: 300;
84
+ }
85
+
86
+ /* Stat cards */
87
+ .stat-card {
88
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
89
+ padding: 1.25rem 1.5rem;
90
+ border-radius: 12px;
91
+ color: white;
92
+ text-align: center;
93
+ box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
94
+ transition: transform 0.2s ease;
95
+ }
96
+ .stat-card:hover {
97
+ transform: translateY(-2px);
98
+ }
99
+ .stat-card .stat-value {
100
+ font-size: 2rem;
101
+ font-weight: 700;
102
+ line-height: 1.2;
103
+ color: #ffffff;
104
+ }
105
+ .stat-card .stat-label {
106
+ font-size: 0.85rem;
107
+ opacity: 0.85;
108
+ margin-top: 0.3rem;
109
+ font-weight: 400;
110
+ color: #e8e6ff;
111
+ }
112
+
113
+ /* Status badge */
114
+ .status-badge {
115
+ display: inline-block;
116
+ padding: 0.3rem 1rem;
117
+ border-radius: 20px;
118
+ font-size: 0.8rem;
119
+ font-weight: 600;
120
+ text-transform: uppercase;
121
+ letter-spacing: 0.5px;
122
+ }
123
+ .status-success {
124
+ background: linear-gradient(135deg, #11998e, #38ef7d);
125
+ color: #ffffff;
126
+ }
127
+ .status-failed {
128
+ background: linear-gradient(135deg, #eb3349, #f45c43);
129
+ color: #ffffff;
130
+ }
131
+ .status-running {
132
+ background: linear-gradient(135deg, #f7971e, #ffd200);
133
+ color: #1a1a2e;
134
+ }
135
+
136
+ /* Section headers — always readable on both light and dark backgrounds */
137
+ .section-header {
138
+ font-size: 1.3rem;
139
+ font-weight: 600;
140
+ color: #c4b5fd;
141
+ margin: 1.5rem 0 0.75rem 0;
142
+ padding-bottom: 0.5rem;
143
+ border-bottom: 2px solid #667eea;
144
+ display: inline-block;
145
+ }
146
+
147
+ /* Taxonomy badges */
148
+ .mapped-badge {
149
+ display: inline-block;
150
+ background: linear-gradient(135deg, #11998e, #38ef7d);
151
+ color: #ffffff;
152
+ padding: 0.2rem 0.7rem;
153
+ border-radius: 12px;
154
+ font-size: 0.75rem;
155
+ font-weight: 600;
156
+ }
157
+ .novel-badge {
158
+ display: inline-block;
159
+ background: linear-gradient(135deg, #fc4a1a, #f7b733);
160
+ color: #ffffff;
161
+ padding: 0.2rem 0.7rem;
162
+ border-radius: 12px;
163
+ font-size: 0.75rem;
164
+ font-weight: 600;
165
+ }
166
+
167
+ /* Sidebar styling */
168
+ section[data-testid="stSidebar"] {
169
+ background: linear-gradient(180deg, #1a1a2e 0%, #16213e 100%);
170
+ }
171
+ section[data-testid="stSidebar"] .stMarkdown {
172
+ color: #e0e0e0;
173
+ }
174
+ section[data-testid="stSidebar"] label {
175
+ color: #e0e0e0 !important;
176
+ }
177
+ section[data-testid="stSidebar"] .stSlider label {
178
+ color: #e0e0e0 !important;
179
+ }
180
+
181
+ /* Data table enhancements */
182
+ .stDataFrame {
183
+ border-radius: 8px;
184
+ overflow: hidden;
185
+ }
186
+
187
+ /* Info box — dark-safe: dark background with light text */
188
+ .info-box {
189
+ background: linear-gradient(135deg, #1e1e3f 0%, #2d2b55 100%);
190
+ padding: 1rem 1.5rem;
191
+ border-radius: 10px;
192
+ border-left: 4px solid #667eea;
193
+ margin: 0.75rem 0;
194
+ color: #e0e0e0;
195
+ }
196
+ .info-box strong {
197
+ color: #ffffff;
198
+ }
199
+ .info-box code {
200
+ background: rgba(102, 126, 234, 0.2);
201
+ color: #b8b5ff;
202
+ padding: 0.1rem 0.4rem;
203
+ border-radius: 4px;
204
+ }
205
+
206
+ /* Pipeline step */
207
+ .step-item {
208
+ padding: 0.5rem 1rem;
209
+ margin: 0.3rem 0;
210
+ border-radius: 8px;
211
+ background: rgba(102, 126, 234, 0.15);
212
+ border-left: 3px solid #667eea;
213
+ font-size: 0.9rem;
214
+ color: #e0e0e0;
215
+ }
216
+
217
+ /* Chart container styling */
218
+ .chart-container {
219
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
220
+ border-radius: 12px;
221
+ padding: 1rem;
222
+ margin: 0.5rem 0;
223
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
224
+ }
225
+
226
+ /* Review section header */
227
+ .review-header {
228
+ background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
229
+ padding: 1rem 1.5rem;
230
+ border-radius: 12px;
231
+ margin-bottom: 1rem;
232
+ box-shadow: 0 4px 15px rgba(17, 153, 142, 0.3);
233
+ }
234
+ .review-header h3 {
235
+ color: #ffffff;
236
+ margin: 0;
237
+ font-weight: 600;
238
+ }
239
+ .review-header p {
240
+ color: #e0fff8;
241
+ margin: 0.3rem 0 0 0;
242
+ font-size: 0.9rem;
243
+ }
244
+
245
+ /* Save confirmation */
246
+ .save-confirm {
247
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
248
+ color: #ffffff;
249
+ padding: 0.75rem 1.25rem;
250
+ border-radius: 10px;
251
+ margin-top: 0.5rem;
252
+ font-weight: 500;
253
+ }
254
+
255
+ /* Ensure tab labels are readable */
256
+ .stTabs [data-baseweb="tab-list"] button {
257
+ color: #c4b5fd;
258
+ }
259
+ .stTabs [data-baseweb="tab-list"] button[aria-selected="true"] {
260
+ color: #ffffff;
261
+ }
262
+ </style>
263
+ """, unsafe_allow_html=True)
264
+
265
+
266
+ # ---------------------------------------------------------------------------
267
+ # Header
268
+ # ---------------------------------------------------------------------------
269
+ st.markdown("""
270
+ <div class="main-header">
271
+ <h1>🔬 Research Topic Modeler</h1>
272
+ <p>AI-powered topic modeling agent for research papers — discover, compare, and classify themes across Titles and Abstracts</p>
273
+ </div>
274
+ """, unsafe_allow_html=True)
275
+
276
+
277
+ # ---------------------------------------------------------------------------
278
+ # Sidebar — Configuration
279
+ # ---------------------------------------------------------------------------
280
+ with st.sidebar:
281
+ st.markdown("## ⚙️ Configuration")
282
+ st.markdown("---")
283
+
284
+ # File upload
285
+ st.markdown("### 📁 Dataset")
286
+ uploaded_file = st.file_uploader(
287
+ "Upload CSV with Title & Abstract columns",
288
+ type=["csv"],
289
+ help="The CSV must contain at least 'Title' and 'Abstract' columns.",
290
+ )
291
+
292
+ # Or use default dataset
293
+ use_default = st.checkbox(
294
+ "Use default dataset (dataset.csv)",
295
+ value=True if not uploaded_file else False,
296
+ help="Use the bundled dataset.csv file in the project directory.",
297
+ )
298
+
299
+ st.markdown("---")
300
+ st.markdown("### 🎯 Parameters")
301
+
302
+ min_topics = st.slider(
303
+ "Minimum Topics",
304
+ min_value=50,
305
+ max_value=200,
306
+ value=100,
307
+ step=10,
308
+ help="Minimum number of topics to generate per source (Titles / Abstracts).",
309
+ )
310
+
311
+ use_llm = st.checkbox(
312
+ "🤖 Use LLM for Label Generation (Groq)",
313
+ value=False,
314
+ help="Use Groq's LLaMA model to generate contextual topic labels. "
315
+ "Falls back to keyword heuristic if unchecked.",
316
+ )
317
+
318
+ groq_key = os.environ.get("GROQ_API_KEY", "")
319
+ if use_llm:
320
+ groq_key = st.text_input(
321
+ "Groq API Key",
322
+ value=groq_key,
323
+ type="password",
324
+ help="Your Groq API key for LLM label generation.",
325
+ )
326
+
327
+ st.markdown("---")
328
+ st.markdown("### 📋 Pipeline Steps")
329
+ steps_info = [
330
+ "1. Load & validate CSV",
331
+ "2. Preprocess text (Titles + Abstracts)",
332
+ "3. Topic modeling — Titles (≥{} topics)".format(min_topics),
333
+ "4. Topic modeling — Abstracts (≥{} topics)".format(min_topics),
334
+ "5. Generate human-readable labels",
335
+ "6. Combine topics table",
336
+ "7. Compare themes (Title vs Abstract)",
337
+ "8. Build taxonomy map (MAPPED / NOVEL)",
338
+ "9. Export outputs (CSV, JSON)",
339
+ ]
340
+ for step in steps_info:
341
+ st.markdown(f'<div class="step-item">{step}</div>', unsafe_allow_html=True)
342
+
343
+
344
+ # ---------------------------------------------------------------------------
345
+ # Main area — Run button and results
346
+ # ---------------------------------------------------------------------------
347
+ col_run, col_status = st.columns([2, 3])
348
+
349
+ with col_run:
350
+ run_clicked = st.button("🚀 Run Topic Modeling Agent", use_container_width=True, type="primary")
351
+
352
+ with col_status:
353
+ if "result" in st.session_state and st.session_state.result is not None:
354
+ res = st.session_state.result
355
+ if res.status == "success":
356
+ st.markdown('<span class="status-badge status-success">✓ Pipeline Complete</span>', unsafe_allow_html=True)
357
+ elif res.status == "failed":
358
+ st.markdown('<span class="status-badge status-failed">✗ Pipeline Failed</span>', unsafe_allow_html=True)
359
+ else:
360
+ st.markdown('<span class="status-badge status-running">● Awaiting Input</span>', unsafe_allow_html=True)
361
+
362
+
363
+ # ---------------------------------------------------------------------------
364
+ # Execute pipeline
365
+ # ---------------------------------------------------------------------------
366
if run_clicked:
    # ---- Resolve the input CSV path -----------------------------------
    csv_path = None
    if uploaded_file is not None:
        # Persist the upload to the system temp directory. Do NOT write it
        # into the working directory (dir=".") — on HF Spaces the working
        # directory can be read-only, which is exactly why OUTPUT_DIR has a
        # /tmp fallback above.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            tmp.write(uploaded_file.getvalue())
            csv_path = tmp.name
    elif use_default:
        csv_path = "dataset.csv"
        if not os.path.exists(csv_path):
            st.error("❌ Default dataset.csv not found in the project directory.")
            st.stop()
    else:
        st.error("❌ Please upload a CSV file or select the default dataset.")
        st.stop()

    # ---- Run the agent step by step so the UI can report progress ------
    with st.spinner("🔄 Running the Topic Modeling Agent … this may take a few minutes."):
        progress = st.progress(0, text="Initializing …")

        agent = TopicAgent(
            csv_path=csv_path,
            output_dir=OUTPUT_DIR,
            min_topics=min_topics,
            use_llm_labels=use_llm,
            groq_api_key=groq_key if use_llm else None,
        )

        # (progress %, status text, bound step method) — same order and
        # milestones as TopicAgent.run().
        steps = [
            (5, "Step 1/9: Loading CSV …", agent._step_load_csv),
            (10, "Step 2/9: Preprocessing text …", agent._step_preprocess),
            (20, "Step 3/9: Topic modeling on Titles …", agent._step_model_titles),
            (45, "Step 4/9: Topic modeling on Abstracts …", agent._step_model_abstracts),
            (65, "Step 5/9: Generating topic labels …", agent._step_generate_labels),
            (75, "Step 6/9: Building combined topics table …", agent._step_combine_topics),
            (80, "Step 7/9: Comparing themes …", agent._step_compare_themes),
            (90, "Step 8/9: Building taxonomy map …", agent._step_taxonomy_map),
            (95, "Step 9/9: Exporting outputs …", agent._step_export),
        ]
        try:
            for pct, text, step_fn in steps:
                progress.progress(pct, text=text)
                step_fn()
        except Exception as exc:
            # Mirror TopicAgent.run(): record the failure on the result so
            # the status badge and error panel render, instead of letting a
            # raw traceback take over the page.
            agent._result.status = "failed"
            agent._result.errors.append(str(exc))
            progress.progress(100, text="❌ Pipeline failed.")
            st.error(f"❌ Pipeline failed: {exc}")
        else:
            agent._result.status = "success"
            progress.progress(100, text="✅ Pipeline complete!")

        st.session_state.result = agent._result

    # Clean up the temporary copy of the upload (best-effort).
    if uploaded_file is not None and csv_path and os.path.exists(csv_path):
        try:
            os.unlink(csv_path)
        except Exception:
            pass

    st.rerun()
428
+
429
+
430
+ # ---------------------------------------------------------------------------
431
+ # Helper: Plotly chart theme (dark background, readable text)
432
+ # ---------------------------------------------------------------------------
433
# Shared Plotly layout options: dark panel backgrounds matched to the app's
# CSS palette (#1a1a2e / #16213e), with light font/tick colors so text stays
# readable against them.
PLOTLY_LAYOUT = dict(
    paper_bgcolor="rgba(26, 26, 46, 0.95)",
    plot_bgcolor="rgba(22, 33, 62, 0.95)",
    font=dict(family="Inter, sans-serif", size=13, color="#e0e0e0"),
    title_font=dict(size=18, color="#ffffff"),
    legend=dict(
        font=dict(color="#e0e0e0"),
        bgcolor="rgba(26, 26, 46, 0.7)",
        bordercolor="#667eea",
        borderwidth=1,
    ),
    # Subtle indigo gridlines on both axes, slightly stronger zero lines.
    xaxis=dict(
        gridcolor="rgba(102, 126, 234, 0.15)",
        zerolinecolor="rgba(102, 126, 234, 0.25)",
        tickfont=dict(color="#c4b5fd"),
        title_font=dict(color="#e0e0e0"),
    ),
    yaxis=dict(
        gridcolor="rgba(102, 126, 234, 0.15)",
        zerolinecolor="rgba(102, 126, 234, 0.25)",
        tickfont=dict(color="#c4b5fd"),
        title_font=dict(color="#e0e0e0"),
    ),
    margin=dict(l=60, r=30, t=60, b=60),
)

# Gradient-like color sequence (hex colors matching the app's indigo/purple
# palette) for chart traces.
CHART_COLORS = [
    "#667eea", "#764ba2", "#f093fb", "#f5576c",
    "#4facfe", "#00f2fe", "#43e97b", "#38f9d7",
    "#fa709a", "#fee140", "#a18cd1", "#fbc2eb",
    "#ff9a9e", "#fad0c4", "#ffecd2", "#fcb69f",
]
466
+
467
+
468
# ---------------------------------------------------------------------------
# Display results
# ---------------------------------------------------------------------------
if "result" in st.session_state and st.session_state.result is not None:
    result = st.session_state.result

    # Abort rendering entirely when the pipeline reported failure.
    if result.status == "failed":
        st.error(f"Pipeline failed with errors: {result.errors}")
        st.stop()

    # ---- Summary Statistics ----
    st.markdown('<div class="section-header">📊 Summary Statistics</div>', unsafe_allow_html=True)

    # One (value, label) pair per headline metric; rendered in a single loop
    # instead of five copy-pasted column blocks (same markup, same order).
    _meta = result.taxonomy_map.get("metadata", {})
    _stat_cards = [
        (len(result.title_topics), "Title Topics"),
        (len(result.abstract_topics), "Abstract Topics"),
        (len(result.combined_topics), "Total Topics"),
        (_meta.get("mapped_count", 0), "Mapped Themes"),
        (_meta.get("novel_count", 0), "Novel Themes"),
    ]
    for _col, (_value, _label) in zip(st.columns(5), _stat_cards):
        with _col:
            st.markdown(f"""
        <div class="stat-card">
            <div class="stat-value">{_value}</div>
            <div class="stat-label">{_label}</div>
        </div>
        """, unsafe_allow_html=True)

    st.markdown("<br>", unsafe_allow_html=True)
521
+
522
+ # ---- Tabbed Results ----
523
+ tab1, tab2, tab3, tab4, tab5, tab_review, tab_charts = st.tabs([
524
+ "📋 Topics Table",
525
+ "🔬 Title Topics",
526
+ "📄 Abstract Topics",
527
+ "⚖️ Theme Comparison",
528
+ "🗺️ Taxonomy Map",
529
+ "✏️ Review Table",
530
+ "📈 Charts",
531
+ ])
532
+
533
+ # Tab 1: Combined Topics Table
534
+ with tab1:
535
+ st.markdown('<div class="section-header">Combined Topics Table</div>', unsafe_allow_html=True)
536
+ st.markdown(f"Showing all **{len(result.combined_topics)}** topics from both Titles and Abstracts.")
537
+
538
+ # Filter controls
539
+ fcol1, fcol2 = st.columns(2)
540
+ with fcol1:
541
+ source_filter = st.multiselect(
542
+ "Filter by Source",
543
+ options=result.combined_topics["source"].unique().tolist(),
544
+ default=result.combined_topics["source"].unique().tolist(),
545
+ )
546
+ with fcol2:
547
+ search_term = st.text_input("🔍 Search keywords", "")
548
+
549
+ display_df = result.combined_topics[result.combined_topics["source"].isin(source_filter)]
550
+ if search_term:
551
+ mask = display_df["keywords"].str.contains(search_term, case=False, na=False)
552
+ mask |= display_df["label"].str.contains(search_term, case=False, na=False)
553
+ display_df = display_df[mask]
554
+
555
+ st.dataframe(
556
+ display_df,
557
+ use_container_width=True,
558
+ height=500,
559
+ column_config={
560
+ "topic_id": st.column_config.NumberColumn("Topic ID", width="small"),
561
+ "keywords": st.column_config.TextColumn("Keywords", width="large"),
562
+ "label": st.column_config.TextColumn("Label", width="medium"),
563
+ "source": st.column_config.TextColumn("Source", width="small"),
564
+ },
565
+ )
566
+
567
+ # Tab 2: Title Topics
568
+ with tab2:
569
+ st.markdown('<div class="section-header">Title Topics</div>', unsafe_allow_html=True)
570
+ st.markdown(f"**{len(result.title_topics)}** topics discovered from paper titles.")
571
+ st.dataframe(result.title_topics, use_container_width=True, height=500)
572
+
573
+ # Tab 3: Abstract Topics
574
+ with tab3:
575
+ st.markdown('<div class="section-header">Abstract Topics</div>', unsafe_allow_html=True)
576
+ st.markdown(f"**{len(result.abstract_topics)}** topics discovered from paper abstracts.")
577
+ st.dataframe(result.abstract_topics, use_container_width=True, height=500)
578
+
579
+ # Tab 4: Theme Comparison
580
+ with tab4:
581
+ st.markdown('<div class="section-header">Theme Comparison: Titles vs Abstracts</div>', unsafe_allow_html=True)
582
+
583
+ if not result.comparison.empty:
584
+ # Alignment distribution
585
+ align_counts = result.comparison["alignment"].value_counts()
586
+ acol1, acol2, acol3, acol4 = st.columns(4)
587
+ for col, alignment in zip(
588
+ [acol1, acol2, acol3, acol4],
589
+ ["Strong", "Moderate", "Weak", "No Match"],
590
+ ):
591
+ with col:
592
+ count = align_counts.get(alignment, 0)
593
+ st.metric(label=f"{alignment} Alignment", value=count)
594
+
595
+ st.markdown("<br>", unsafe_allow_html=True)
596
+
597
+ # Filter by alignment
598
+ alignment_filter = st.multiselect(
599
+ "Filter by Alignment",
600
+ options=["Strong", "Moderate", "Weak", "No Match"],
601
+ default=["Strong", "Moderate", "Weak", "No Match"],
602
+ )
603
+ filtered_comp = result.comparison[result.comparison["alignment"].isin(alignment_filter)]
604
+
605
+ st.dataframe(
606
+ filtered_comp,
607
+ use_container_width=True,
608
+ height=500,
609
+ column_config={
610
+ "similarity": st.column_config.ProgressColumn(
611
+ "Similarity",
612
+ min_value=0,
613
+ max_value=1,
614
+ format="%.2f",
615
+ ),
616
+ },
617
+ )
618
+ else:
619
+ st.info("No comparison data available.")
620
+
621
+ # Tab 5: Taxonomy Map
622
+ with tab5:
623
+ st.markdown('<div class="section-header">Taxonomy Map</div>', unsafe_allow_html=True)
624
+
625
+ taxonomy = result.taxonomy_map
626
+ meta = taxonomy.get("metadata", {})
627
+
628
+ st.markdown(f"""
629
+ <div class="info-box">
630
+ <strong>Classification Summary:</strong><br>
631
+ Total Topics: <strong>{meta.get('total_topics', 0)}</strong> |
632
+ <span class="mapped-badge">MAPPED: {meta.get('mapped_count', 0)}</span> |
633
+ <span class="novel-badge">NOVEL: {meta.get('novel_count', 0)}</span> |
634
+ Threshold: {meta.get('threshold', 0.15)}
635
+ </div>
636
+ """, unsafe_allow_html=True)
637
+
638
+ tax_tab1, tax_tab2 = st.tabs(["✅ Mapped Themes", "🆕 Novel Themes"])
639
+
640
+ with tax_tab1:
641
+ mapped_list = taxonomy.get("mapped", [])
642
+ if mapped_list:
643
+ mapped_df = pd.DataFrame(mapped_list)
644
+ st.dataframe(
645
+ mapped_df,
646
+ use_container_width=True,
647
+ height=400,
648
+ column_config={
649
+ "score": st.column_config.ProgressColumn(
650
+ "Match Score",
651
+ min_value=0,
652
+ max_value=1,
653
+ format="%.3f",
654
+ ),
655
+ },
656
+ )
657
+ else:
658
+ st.info("No mapped themes found.")
659
+
660
+ with tax_tab2:
661
+ novel_list = taxonomy.get("novel", [])
662
+ if novel_list:
663
+ novel_df = pd.DataFrame(novel_list)
664
+ st.dataframe(
665
+ novel_df,
666
+ use_container_width=True,
667
+ height=400,
668
+ column_config={
669
+ "score": st.column_config.ProgressColumn(
670
+ "Match Score",
671
+ min_value=0,
672
+ max_value=1,
673
+ format="%.3f",
674
+ ),
675
+ },
676
+ )
677
+ else:
678
+ st.info("No novel themes found.")
679
+
680
+ # ==================================================================
681
+ # Tab 6: Editable Review Table
682
+ # ==================================================================
683
+ with tab_review:
684
+ st.markdown("""
685
+ <div class="review-header">
686
+ <h3>✏️ Topic Review Table</h3>
687
+ <p>Review, approve, rename, and annotate each topic. Changes are saved to outputs/review_table.csv.</p>
688
+ </div>
689
+ """, unsafe_allow_html=True)
690
+
691
+ # Build review dataframe from combined topics
692
+ # Load existing review table if available to preserve edits
693
+ review_csv_path = os.path.join(OUTPUT_DIR, "review_table.csv")
694
+
695
+ if "review_df" not in st.session_state:
696
+ if os.path.exists(review_csv_path):
697
+ # Load previously saved review table
698
+ existing_review = pd.read_csv(review_csv_path)
699
+ # Merge with current topics to ensure all topics are represented
700
+ current_ids = set(result.combined_topics["topic_id"].tolist())
701
+ existing_ids = set(existing_review["topic_id"].tolist()) if "topic_id" in existing_review.columns else set()
702
+
703
+ if current_ids == existing_ids or existing_ids.issuperset(current_ids):
704
+ st.session_state.review_df = existing_review
705
+ else:
706
+ # Rebuild from current topics, but preserve existing edits
707
+ review_data = []
708
+ for _, row in result.combined_topics.iterrows():
709
+ review_data.append({
710
+ "topic_id": int(row["topic_id"]),
711
+ "label": row.get("label", ""),
712
+ "keywords": row.get("keywords", ""),
713
+ "source": row.get("source", ""),
714
+ "approve": False,
715
+ "rename_to": "",
716
+ "reasoning": "",
717
+ })
718
+ new_review_df = pd.DataFrame(review_data)
719
+ # Merge existing edits
720
+ if not existing_review.empty and "topic_id" in existing_review.columns:
721
+ for _, erow in existing_review.iterrows():
722
+ mask = new_review_df["topic_id"] == erow["topic_id"]
723
+ if mask.any():
724
+ if "approve" in erow:
725
+ new_review_df.loc[mask, "approve"] = erow["approve"]
726
+ if "rename_to" in erow and pd.notna(erow["rename_to"]):
727
+ new_review_df.loc[mask, "rename_to"] = erow["rename_to"]
728
+ if "reasoning" in erow and pd.notna(erow["reasoning"]):
729
+ new_review_df.loc[mask, "reasoning"] = erow["reasoning"]
730
+ st.session_state.review_df = new_review_df
731
+ else:
732
+ # Build fresh review table
733
+ review_data = []
734
+ for _, row in result.combined_topics.iterrows():
735
+ review_data.append({
736
+ "topic_id": int(row["topic_id"]),
737
+ "label": row.get("label", ""),
738
+ "keywords": row.get("keywords", ""),
739
+ "source": row.get("source", ""),
740
+ "approve": False,
741
+ "rename_to": "",
742
+ "reasoning": "",
743
+ })
744
+ st.session_state.review_df = pd.DataFrame(review_data)
745
+
746
+ # Filter controls for review table
747
+ rv_col1, rv_col2, rv_col3 = st.columns(3)
748
+ with rv_col1:
749
+ review_source_filter = st.multiselect(
750
+ "Filter by Source",
751
+ options=st.session_state.review_df["source"].unique().tolist(),
752
+ default=st.session_state.review_df["source"].unique().tolist(),
753
+ key="review_source_filter",
754
+ )
755
+ with rv_col2:
756
+ review_search = st.text_input("🔍 Search in review table", "", key="review_search")
757
+ with rv_col3:
758
+ review_approval_filter = st.selectbox(
759
+ "Show",
760
+ options=["All Topics", "Approved Only", "Not Approved"],
761
+ index=0,
762
+ key="review_approval_filter",
763
+ )
764
+
765
+ # Apply filters
766
+ filtered_review = st.session_state.review_df[
767
+ st.session_state.review_df["source"].isin(review_source_filter)
768
+ ]
769
+ if review_search:
770
+ search_mask = (
771
+ filtered_review["keywords"].str.contains(review_search, case=False, na=False) |
772
+ filtered_review["label"].str.contains(review_search, case=False, na=False)
773
+ )
774
+ filtered_review = filtered_review[search_mask]
775
+ if review_approval_filter == "Approved Only":
776
+ filtered_review = filtered_review[filtered_review["approve"] == True]
777
+ elif review_approval_filter == "Not Approved":
778
+ filtered_review = filtered_review[filtered_review["approve"] == False]
779
+
780
+ # Editable data editor
781
+ edited_df = st.data_editor(
782
+ filtered_review,
783
+ use_container_width=True,
784
+ height=500,
785
+ num_rows="fixed",
786
+ key="review_editor",
787
+ column_config={
788
+ "topic_id": st.column_config.NumberColumn(
789
+ "Topic ID", width="small", disabled=True
790
+ ),
791
+ "label": st.column_config.TextColumn(
792
+ "Label", width="medium",
793
+ ),
794
+ "keywords": st.column_config.TextColumn(
795
+ "Keywords", width="large", disabled=True,
796
+ ),
797
+ "source": st.column_config.TextColumn(
798
+ "Source", width="small", disabled=True,
799
+ ),
800
+ "approve": st.column_config.CheckboxColumn(
801
+ "✅ Approve", width="small", default=False,
802
+ ),
803
+ "rename_to": st.column_config.TextColumn(
804
+ "Rename To", width="medium",
805
+ ),
806
+ "reasoning": st.column_config.TextColumn(
807
+ "Reasoning / Notes", width="large",
808
+ ),
809
+ },
810
+ column_order=["topic_id", "label", "keywords", "approve", "rename_to", "reasoning", "source"],
811
+ )
812
+
813
+ # Update session state with edits
814
+ if edited_df is not None:
815
+ # Merge edits back into the full review dataframe
816
+ for idx, erow in edited_df.iterrows():
817
+ mask = st.session_state.review_df.index == idx
818
+ if mask.any():
819
+ for col in ["label", "approve", "rename_to", "reasoning"]:
820
+ if col in erow:
821
+ st.session_state.review_df.loc[mask, col] = erow[col]
822
+
823
+ # Save button
824
+ sv_col1, sv_col2, sv_col3 = st.columns([1, 1, 2])
825
+ with sv_col1:
826
+ if st.button("💾 Save Review Table", use_container_width=True, type="primary"):
827
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
828
+ st.session_state.review_df.to_csv(review_csv_path, index=False)
829
+ st.markdown(
830
+ '<div class="save-confirm">✅ Review table saved to outputs/review_table.csv</div>',
831
+ unsafe_allow_html=True,
832
+ )
833
+ with sv_col2:
834
+ approved_count = int(st.session_state.review_df["approve"].sum()) if "approve" in st.session_state.review_df.columns else 0
835
+ total_count = len(st.session_state.review_df)
836
+ st.markdown(f"""
837
+ <div class="stat-card" style="padding: 0.75rem 1rem;">
838
+ <div class="stat-value" style="font-size: 1.4rem;">{approved_count}/{total_count}</div>
839
+ <div class="stat-label">Topics Approved</div>
840
+ </div>
841
+ """, unsafe_allow_html=True)
842
+
843
+ # ==================================================================
844
+ # Tab 7: Charts
845
+ # ==================================================================
846
+ with tab_charts:
847
+ st.markdown('<div class="section-header">📈 Topic Visualizations</div>', unsafe_allow_html=True)
848
+
849
+ # -----------------------------------------------------------
850
+ # Chart 1: Topic Frequency by Source
851
+ # -----------------------------------------------------------
852
+ st.markdown("#### 📊 Topic Frequency by Source")
853
+ st.caption("Number of topics discovered from each source (Titles vs Abstracts).")
854
+
855
+ source_counts = result.combined_topics["source"].value_counts().reset_index()
856
+ source_counts.columns = ["Source", "Count"]
857
+
858
+ fig1 = px.bar(
859
+ source_counts,
860
+ x="Source",
861
+ y="Count",
862
+ color="Source",
863
+ color_discrete_sequence=["#667eea", "#764ba2"],
864
+ text="Count",
865
+ )
866
+ fig1.update_traces(
867
+ textposition="outside",
868
+ textfont=dict(color="#e0e0e0", size=14, family="Inter"),
869
+ marker=dict(
870
+ line=dict(width=0),
871
+ ),
872
+ )
873
+ fig1.update_layout(
874
+ **PLOTLY_LAYOUT,
875
+ title="Topic Count by Source",
876
+ xaxis_title="Source",
877
+ yaxis_title="Number of Topics",
878
+ showlegend=False,
879
+ height=420,
880
+ )
881
+ st.plotly_chart(fig1, use_container_width=True)
882
+
883
+ st.markdown("---")
884
+
885
+ # -----------------------------------------------------------
886
+ # Chart 2: Top Keywords Across All Topics
887
+ # -----------------------------------------------------------
888
+ st.markdown("#### 🔤 Top Keywords Across All Topics")
889
+ st.caption("Most frequently occurring keywords across all discovered topics.")
890
+
891
+ # Extract all keywords, count frequencies
892
+ all_keywords = []
893
+ for kw_str in result.combined_topics["keywords"].dropna():
894
+ for kw in kw_str.split(","):
895
+ kw_clean = kw.strip().lower()
896
+ if kw_clean and len(kw_clean) > 2:
897
+ all_keywords.append(kw_clean)
898
+
899
+ kw_counts = pd.Series(all_keywords).value_counts().head(25).reset_index()
900
+ kw_counts.columns = ["Keyword", "Frequency"]
901
+
902
+ fig2 = px.bar(
903
+ kw_counts,
904
+ x="Frequency",
905
+ y="Keyword",
906
+ orientation="h",
907
+ color="Frequency",
908
+ color_continuous_scale=["#302b63", "#667eea", "#f093fb", "#f5576c"],
909
+ )
910
+ fig2.update_traces(
911
+ marker=dict(line=dict(width=0)),
912
+ )
913
+ fig2.update_layout(
914
+ **PLOTLY_LAYOUT,
915
+ title="Top 25 Keywords by Frequency",
916
+ xaxis_title="Frequency (across all topics)",
917
+ yaxis_title="",
918
+ height=700,
919
+ coloraxis_colorbar=dict(
920
+ title="Freq",
921
+ tickfont=dict(color="#c4b5fd"),
922
+ title_font=dict(color="#e0e0e0"),
923
+ ),
924
+ )
925
+ # Override yaxis separately to avoid duplicate keyword with PLOTLY_LAYOUT
926
+ fig2.update_layout(
927
+ yaxis=dict(
928
+ autorange="reversed",
929
+ gridcolor="rgba(102, 126, 234, 0.1)",
930
+ tickfont=dict(color="#c4b5fd", size=12),
931
+ ),
932
+ )
933
+ st.plotly_chart(fig2, use_container_width=True)
934
+
935
+ st.markdown("---")
936
+
937
+ # -----------------------------------------------------------
938
+ # Chart 3: Taxonomy Distribution (Mapped vs Novel)
939
+ # -----------------------------------------------------------
940
+ st.markdown("#### 🧬 Taxonomy Classification Distribution")
941
+ st.caption("How topics are classified against the known research taxonomy.")
942
+
943
+ tax_meta = result.taxonomy_map.get("metadata", {})
944
+ tax_data = pd.DataFrame({
945
+ "Classification": ["MAPPED", "NOVEL"],
946
+ "Count": [tax_meta.get("mapped_count", 0), tax_meta.get("novel_count", 0)],
947
+ })
948
+
949
+ chart3_col1, chart3_col2 = st.columns(2)
950
+
951
+ with chart3_col1:
952
+ fig3a = px.pie(
953
+ tax_data,
954
+ values="Count",
955
+ names="Classification",
956
+ color="Classification",
957
+ color_discrete_map={
958
+ "MAPPED": "#38ef7d",
959
+ "NOVEL": "#f7b733",
960
+ },
961
+ hole=0.55,
962
+ )
963
+ fig3a.update_traces(
964
+ textfont=dict(color="#ffffff", size=14),
965
+ textinfo="percent+label",
966
+ marker=dict(line=dict(color="#1a1a2e", width=3)),
967
+ )
968
+ fig3a.update_layout(
969
+ paper_bgcolor="rgba(26, 26, 46, 0.95)",
970
+ plot_bgcolor="rgba(22, 33, 62, 0.95)",
971
+ font=dict(family="Inter, sans-serif", size=13, color="#e0e0e0"),
972
+ title=dict(text="Mapped vs Novel", font=dict(size=16, color="#ffffff")),
973
+ legend=dict(font=dict(color="#e0e0e0")),
974
+ height=380,
975
+ margin=dict(l=20, r=20, t=50, b=20),
976
+ )
977
+ st.plotly_chart(fig3a, use_container_width=True)
978
+
979
+ with chart3_col2:
980
+ fig3b = px.bar(
981
+ tax_data,
982
+ x="Classification",
983
+ y="Count",
984
+ color="Classification",
985
+ color_discrete_map={
986
+ "MAPPED": "#38ef7d",
987
+ "NOVEL": "#f7b733",
988
+ },
989
+ text="Count",
990
+ )
991
+ fig3b.update_traces(
992
+ textposition="outside",
993
+ textfont=dict(color="#e0e0e0", size=16, family="Inter"),
994
+ marker=dict(line=dict(width=0)),
995
+ )
996
+ fig3b.update_layout(
997
+ **PLOTLY_LAYOUT,
998
+ title="Classification Count",
999
+ xaxis_title="",
1000
+ yaxis_title="Number of Topics",
1001
+ showlegend=False,
1002
+ height=380,
1003
+ )
1004
+ st.plotly_chart(fig3b, use_container_width=True)
1005
+
1006
+ st.markdown("---")
1007
+
1008
+ # -----------------------------------------------------------
1009
+ # Chart 4: Alignment Distribution (from comparisons)
1010
+ # -----------------------------------------------------------
1011
+ if not result.comparison.empty:
1012
+ st.markdown("#### ⚖️ Theme Alignment Distribution")
1013
+ st.caption("Distribution of alignment strength between Title and Abstract topics.")
1014
+
1015
+ alignment_data = result.comparison["alignment"].value_counts().reset_index()
1016
+ alignment_data.columns = ["Alignment", "Count"]
1017
+
1018
+ # Define order and colors
1019
+ align_order = ["Strong", "Moderate", "Weak", "No Match"]
1020
+ align_colors = {
1021
+ "Strong": "#38ef7d",
1022
+ "Moderate": "#4facfe",
1023
+ "Weak": "#f7971e",
1024
+ "No Match": "#f5576c",
1025
+ }
1026
+
1027
+ fig4 = px.bar(
1028
+ alignment_data,
1029
+ x="Alignment",
1030
+ y="Count",
1031
+ color="Alignment",
1032
+ color_discrete_map=align_colors,
1033
+ text="Count",
1034
+ category_orders={"Alignment": align_order},
1035
+ )
1036
+ fig4.update_traces(
1037
+ textposition="outside",
1038
+ textfont=dict(color="#e0e0e0", size=14, family="Inter"),
1039
+ marker=dict(line=dict(width=0)),
1040
+ )
1041
+ fig4.update_layout(
1042
+ **PLOTLY_LAYOUT,
1043
+ title="Title ↔ Abstract Alignment Distribution",
1044
+ xaxis_title="Alignment Level",
1045
+ yaxis_title="Number of Topic Pairs",
1046
+ showlegend=False,
1047
+ height=420,
1048
+ )
1049
+ st.plotly_chart(fig4, use_container_width=True)
1050
+
1051
+ st.markdown("---")
1052
+
1053
+ # -----------------------------------------------------------
1054
+ # Chart 5: Similarity Score Histogram
1055
+ # -----------------------------------------------------------
1056
+ st.markdown("#### 📐 Similarity Score Distribution")
1057
+ st.caption("Distribution of Jaccard similarity scores between matched Title and Abstract topics.")
1058
+
1059
+ fig5 = px.histogram(
1060
+ result.comparison,
1061
+ x="similarity",
1062
+ nbins=30,
1063
+ color_discrete_sequence=["#667eea"],
1064
+ marginal="box",
1065
+ )
1066
+ fig5.update_traces(
1067
+ marker=dict(
1068
+ line=dict(width=1, color="#b8b5ff"),
1069
+ ),
1070
+ selector=dict(type="histogram"),
1071
+ )
1072
+ fig5.update_layout(
1073
+ **PLOTLY_LAYOUT,
1074
+ title="Similarity Score Histogram",
1075
+ xaxis_title="Jaccard Similarity Score",
1076
+ yaxis_title="Count",
1077
+ height=420,
1078
+ bargap=0.05,
1079
+ )
1080
+ st.plotly_chart(fig5, use_container_width=True)
1081
+
1082
+ # ---- Downloads Section ----
1083
+ st.markdown('<div class="section-header">📥 Download Outputs</div>', unsafe_allow_html=True)
1084
+
1085
+ dcol1, dcol2, dcol3, dcol4 = st.columns(4)
1086
+
1087
+ with dcol1:
1088
+ csv_data = result.combined_topics.to_csv(index=False)
1089
+ st.download_button(
1090
+ "⬇️ Topics Table (CSV)",
1091
+ data=csv_data,
1092
+ file_name="topics_table.csv",
1093
+ mime="text/csv",
1094
+ use_container_width=True,
1095
+ )
1096
+
1097
+ with dcol2:
1098
+ comp_data = result.comparison.to_csv(index=False)
1099
+ st.download_button(
1100
+ "⬇️ Comparison (CSV)",
1101
+ data=comp_data,
1102
+ file_name="comparison.csv",
1103
+ mime="text/csv",
1104
+ use_container_width=True,
1105
+ )
1106
+
1107
+ with dcol3:
1108
+ json_data = json.dumps(result.taxonomy_map, indent=2, ensure_ascii=False)
1109
+ st.download_button(
1110
+ "⬇️ Taxonomy Map (JSON)",
1111
+ data=json_data,
1112
+ file_name="taxonomy_map.json",
1113
+ mime="application/json",
1114
+ use_container_width=True,
1115
+ )
1116
+
1117
+ with dcol4:
1118
+ # Download review table if it exists
1119
+ review_path = os.path.join(OUTPUT_DIR, "review_table.csv")
1120
+ if os.path.exists(review_path):
1121
+ with open(review_path, "r") as f:
1122
+ review_data = f.read()
1123
+ st.download_button(
1124
+ "⬇️ Review Table (CSV)",
1125
+ data=review_data,
1126
+ file_name="review_table.csv",
1127
+ mime="text/csv",
1128
+ use_container_width=True,
1129
+ )
1130
+ else:
1131
+ st.download_button(
1132
+ "⬇️ Review Table (CSV)",
1133
+ data="Not saved yet. Go to Review Table tab and click Save.",
1134
+ file_name="review_table.csv",
1135
+ mime="text/csv",
1136
+ use_container_width=True,
1137
+ disabled=True,
1138
+ )
1139
+
1140
+ # ---- Auto-save comparison.csv and taxonomy_map.json to outputs ----
1141
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
1142
+ result.comparison.to_csv(os.path.join(OUTPUT_DIR, "comparison.csv"), index=False)
1143
+ with open(os.path.join(OUTPUT_DIR, "taxonomy_map.json"), "w", encoding="utf-8") as f:
1144
+ json.dump(result.taxonomy_map, f, indent=2, ensure_ascii=False)
1145
+
1146
+ # ---- Pipeline Log ----
1147
+ with st.expander("📜 Pipeline Execution Log"):
1148
+ st.markdown(f"**Status:** `{result.status}`")
1149
+ st.markdown(f"**Steps Completed:** {len(result.steps_completed)}/9")
1150
+ for i, step in enumerate(result.steps_completed, 1):
1151
+ st.markdown(f" ✅ Step {i}: `{step}`")
1152
+ if result.errors:
1153
+ st.markdown("**Errors:**")
1154
+ for err in result.errors:
1155
+ st.error(err)
1156
+ st.markdown("**Exported Files:**")
1157
+ for name, path in result.exported_files.items():
1158
+ st.markdown(f" 📄 `{name}` → `{path}`")
1159
+
1160
+ else:
1161
+ # ---- Welcome / instructions when no results ----
1162
+ st.markdown("""
1163
+ <div class="info-box">
1164
+ <strong>👋 Welcome!</strong><br><br>
1165
+ This application uses an AI agent to perform comprehensive topic modeling on research papers.
1166
+ <br><br>
1167
+ <strong>How to use:</strong><br>
1168
+ 1️⃣ Upload a CSV file with <code>Title</code> and <code>Abstract</code> columns (or use the default dataset).<br>
1169
+ 2️⃣ Configure the minimum number of topics and label generation method in the sidebar.<br>
1170
+ 3️⃣ Click <strong>"🚀 Run Topic Modeling Agent"</strong> to start the analysis.<br>
1171
+ 4️⃣ Explore topics, comparisons, and taxonomy classification in the results tabs.<br>
1172
+ 5️⃣ Review and annotate topics in the <strong>✏️ Review Table</strong> tab.<br>
1173
+ 6️⃣ View interactive charts in the <strong>📈 Charts</strong> tab.<br>
1174
+ 7️⃣ Download all outputs as CSV and JSON files.
1175
+ </div>
1176
+ """, unsafe_allow_html=True)
1177
+
1178
+ st.markdown("<br>", unsafe_allow_html=True)
1179
+
1180
+ # Show a preview if default dataset exists
1181
+ if os.path.exists("dataset.csv"):
1182
+ with st.expander("👀 Preview Default Dataset", expanded=False):
1183
+ try:
1184
+ preview_df = pd.read_csv("dataset.csv", nrows=10)
1185
+ st.markdown(f"**Columns:** {', '.join(preview_df.columns.tolist())}")
1186
+ if "Title" in preview_df.columns:
1187
+ st.dataframe(preview_df[["Title", "Abstract"]].head(10) if "Abstract" in preview_df.columns else preview_df[["Title"]].head(10), use_container_width=True)
1188
+ else:
1189
+ st.dataframe(preview_df.head(10), use_container_width=True)
1190
+ except Exception as e:
1191
+ st.warning(f"Could not preview dataset: {e}")
dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a602dfcb3982c58156c67f4fb2565cc8ec9b4b2368a1b6ad4be3c621c1232218
3
+ size 28342399
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for the AI Topic Modeling Agent
2
+ # 13 packages as specified
3
+
4
+ streamlit>=1.30.0
5
+ pandas>=2.0.0
6
+ numpy>=1.24.0
7
+ scikit-learn>=1.3.0
8
+ nltk>=3.8.0
9
+ bertopic>=0.16.0
10
+ umap-learn>=0.5.4
11
+ hdbscan>=0.8.33
12
+ sentence-transformers>=2.2.0
13
+ groq>=0.4.0
14
+ plotly>=5.18.0
15
+ scipy>=1.11.0
16
+ joblib>=1.3.0
tools.py ADDED
@@ -0,0 +1,626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tools.py — Core functions for the AI-driven topic modeling pipeline.
3
+
4
+ This module provides all analytical functions used by the TopicAgent:
5
+ - CSV ingestion and validation
6
+ - Text preprocessing (lowercasing, stopword removal, cleaning)
7
+ - Topic modeling via BERTopic (with fallback to sklearn LDA)
8
+ - Automatic human-readable label generation
9
+ - Cross-source theme comparison (Title vs Abstract)
10
+ - Taxonomy mapping (MAPPED / NOVEL classification)
11
+ """
12
+
13
+ import re
14
+ import json
15
+ import logging
16
+ from typing import Dict, List, Tuple, Optional, Any
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ import nltk
21
+ from nltk.corpus import stopwords
22
+ from nltk.tokenize import word_tokenize
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Logging
26
+ # ---------------------------------------------------------------------------
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # NLTK data download (idempotent)
32
+ # ---------------------------------------------------------------------------
33
+ for _resource in ("punkt", "punkt_tab", "stopwords"):
34
+ try:
35
+ nltk.data.find(f"tokenizers/{_resource}" if "punkt" in _resource else f"corpora/{_resource}")
36
+ except LookupError:
37
+ nltk.download(_resource, quiet=True)
38
+
39
# ---------------------------------------------------------------------------
# Reference taxonomy of known AI / business / research themes
# Used by create_taxonomy_map() for MAPPED vs NOVEL classification
# ---------------------------------------------------------------------------
# Category sub-lists kept private; KNOWN_THEMES is the flat public list.
_AI_ML_THEMES = [
    "artificial intelligence", "machine learning", "deep learning", "neural network",
    "natural language processing", "computer vision", "reinforcement learning",
    "generative ai", "large language model", "transformer", "chatbot",
    "recommendation system", "knowledge graph", "robotics", "autonomous",
    "explainable ai", "federated learning", "transfer learning", "ai ethics",
    "adversarial", "gan", "diffusion model", "prompt engineering",
]
_DATA_SCIENCE_THEMES = [
    "data mining", "big data", "analytics", "data science", "data quality",
    "feature engineering", "dimensionality reduction", "clustering", "classification",
    "regression", "time series", "anomaly detection", "sentiment analysis",
]
_BUSINESS_THEMES = [
    "digital transformation", "innovation", "strategy", "supply chain",
    "customer experience", "marketing", "e-commerce", "fintech", "blockchain",
    "sustainability", "corporate social responsibility", "knowledge management",
    "decision support", "business intelligence", "enterprise", "organizational",
    "human resource", "leadership", "entrepreneurship", "business model",
]
_INFO_SYSTEMS_THEMES = [
    "information systems", "technology adoption", "user acceptance", "privacy",
    "security", "trust", "social media", "online community", "platform",
    "crowdsourcing", "cloud computing", "iot", "internet of things",
    "software engineering", "agile", "devops", "digital platform",
]
_HEALTH_SOCIETY_THEMES = [
    "healthcare", "telemedicine", "electronic health", "public health",
    "education", "e-learning", "smart city", "government", "policy",
    "ethics", "fairness", "bias", "misinformation", "content moderation",
]
_RESEARCH_METHOD_THEMES = [
    "survey", "experiment", "case study", "meta-analysis", "bibliometric",
    "systematic review", "structural equation", "grounded theory",
]

# Flat list preserving the original category ordering.
KNOWN_THEMES: List[str] = (
    _AI_ML_THEMES
    + _DATA_SCIENCE_THEMES
    + _BUSINESS_THEMES
    + _INFO_SYSTEMS_THEMES
    + _HEALTH_SOCIETY_THEMES
    + _RESEARCH_METHOD_THEMES
)
74
+
75
+
76
# ===================================================================
# 1. load_csv — Ingest and validate the CSV dataset
# ===================================================================
def load_csv(filepath: str) -> pd.DataFrame:
    """
    Load a CSV file and ensure the required columns (Title, Abstract) exist.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.

    Returns
    -------
    pd.DataFrame
        DataFrame with at least 'Title' and 'Abstract' columns, with no
        row where both fields are blank.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist (raised by pandas).
    ValueError
        If required columns are missing.
    """
    logger.info("Loading CSV from %s", filepath)
    df = pd.read_csv(filepath, encoding="utf-8", on_bad_lines="skip")
    logger.info("Loaded %d rows × %d columns", len(df), len(df.columns))

    # Validate required columns (case-insensitive match)
    col_map = {c.strip().lower(): c for c in df.columns}
    missing = {"title", "abstract"} - set(col_map.keys())
    if missing:
        raise ValueError(f"CSV is missing required columns: {missing}. Found: {list(df.columns)}")

    # Rename to canonical form
    df = df.rename(columns={col_map["title"]: "Title", col_map["abstract"]: "Abstract"})

    # Normalise missing values to empty strings first, then drop rows where
    # BOTH fields are blank. The previous dropna(how="all") only removed
    # NaN pairs and let rows containing empty/whitespace-only strings through,
    # contradicting the documented contract.
    df["Title"] = df["Title"].fillna("")
    df["Abstract"] = df["Abstract"].fillna("")
    blank = (
        df["Title"].astype(str).str.strip().eq("")
        & df["Abstract"].astype(str).str.strip().eq("")
    )
    df = df[~blank].reset_index(drop=True)

    logger.info("After cleaning: %d usable rows", len(df))
    return df
121
+
122
+
123
+ # ===================================================================
124
+ # 2. preprocess_text — Clean and normalise a list of text documents
125
+ # ===================================================================
126
def preprocess_text(documents: List[str]) -> List[str]:
    """
    Clean and normalise raw text documents for topic modeling.

    Pipeline per document: lowercase -> strip URLs and emails -> keep
    letters/spaces only -> collapse whitespace -> tokenize -> drop
    stopwords and tokens of length <= 2 -> rejoin with single spaces.
    Non-string or blank inputs map to the empty string.

    Parameters
    ----------
    documents : list of str
        Raw text documents.

    Returns
    -------
    list of str
        Cleaned text documents, one per input document.
    """
    stop_words = set(stopwords.words("english"))
    # Boilerplate terms common in academic abstracts, added to NLTK's list.
    stop_words |= {
        "©", "elsevier", "rights", "reserved", "doi", "http", "https",
        "vol", "pp", "fig", "table", "journal", "author", "authors",
        "study", "paper", "research", "results", "findings", "however",
        "propose", "proposed", "approach", "using", "based", "also",
        "show", "shows", "shown", "may", "used", "use", "one", "two",
        "three", "new", "well", "within", "among", "across", "toward",
        "towards", "et", "al", "ie", "eg", "cf", "thus", "therefore",
        "moreover", "furthermore", "addition", "conclusion", "conclusions",
    }

    def _clean(raw: str) -> str:
        # Normalisation steps applied in a fixed order.
        text = raw.lower()
        text = re.sub(r"https?://\S+|www\.\S+", " ", text)  # URLs
        text = re.sub(r"\S+@\S+", " ", text)                # emails
        text = re.sub(r"[^a-z\s]", " ", text)               # digits / punctuation
        text = re.sub(r"\s+", " ", text).strip()            # collapse whitespace
        kept = [tok for tok in word_tokenize(text)
                if tok not in stop_words and len(tok) > 2]
        return " ".join(kept)

    cleaned: List[str] = [
        _clean(doc) if isinstance(doc, str) and doc.strip() else ""
        for doc in documents
    ]

    logger.info("Preprocessed %d documents", len(cleaned))
    return cleaned
183
+
184
+
185
+ # ===================================================================
186
+ # 3. run_topic_modeling — Discover topics via BERTopic (or LDA fallback)
187
+ # ===================================================================
188
def run_topic_modeling(
    documents: List[str],
    source_label: str = "documents",
    min_topics: int = 100,
    use_bertopic: bool = True,
) -> Tuple[pd.DataFrame, Any]:
    """
    Discover topics in a corpus of preprocessed documents.

    Strategy: attempt BERTopic (UMAP + HDBSCAN) first when requested.
    If BERTopic fails, or yields fewer than ``min_topics`` topics, fall
    back to sklearn LDA configured with exactly ``min_topics`` components
    so the requested topic count is guaranteed.

    Parameters
    ----------
    documents : list of str
        Preprocessed text documents; empty strings are ignored.
    source_label : str
        Label used in log messages (e.g. "Titles" or "Abstracts").
    min_topics : int
        Minimum number of topics required (default 100).
    use_bertopic : bool
        Whether to attempt BERTopic before falling back to LDA.

    Returns
    -------
    topics_df : pd.DataFrame
        Columns: topic_id, keywords (comma-separated), source.
    model : object
        The fitted topic model for downstream inspection.

    Raises
    ------
    ValueError
        If fewer than 20 non-empty documents are supplied.
    """
    corpus = [doc for doc in documents if doc.strip()]
    if len(corpus) < 20:
        raise ValueError(f"Not enough valid documents ({len(corpus)}) for topic modeling.")

    logger.info("Running topic modeling on %d %s (target ≥ %d topics)", len(corpus), source_label, min_topics)

    topics_df: Optional[pd.DataFrame] = None
    model: Any = None

    # First pass: BERTopic (optional, may fail on small/odd corpora).
    if use_bertopic:
        try:
            topics_df, model = _run_bertopic(corpus, source_label, min_topics)
        except Exception as exc:
            logger.warning("BERTopic failed (%s). Falling back to LDA.", exc)
            topics_df = None

    # Second pass: LDA guarantees the requested topic count.
    if topics_df is None or len(topics_df) < min_topics:
        logger.info("Using LDA to guarantee ≥ %d topics for %s", min_topics, source_label)
        topics_df, model = _run_lda(corpus, source_label, min_topics)

    logger.info("Topic modeling complete for %s: %d topics discovered", source_label, len(topics_df))
    return topics_df, model
246
+
247
+
248
def _run_bertopic(docs: List[str], source_label: str, min_topics: int):
    """Fit a BERTopic model (UMAP + HDBSCAN + CountVectorizer) and
    summarise its topics as a DataFrame of topic_id / keywords / source.

    The outlier topic (-1) is excluded from the summary. ``min_topics``
    is accepted for signature symmetry with _run_lda; the caller checks
    the resulting topic count itself.
    """
    from bertopic import BERTopic
    from umap import UMAP
    from hdbscan import HDBSCAN
    from sklearn.feature_extraction.text import CountVectorizer

    topic_model = BERTopic(
        umap_model=UMAP(
            n_neighbors=10,
            n_components=5,
            min_dist=0.0,
            metric="cosine",
            random_state=42,
        ),
        hdbscan_model=HDBSCAN(
            min_cluster_size=5,
            min_samples=2,
            prediction_data=True,
        ),
        vectorizer_model=CountVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_df=0.90,
            min_df=2,
        ),
        nr_topics="auto",
        top_n_words=10,
        verbose=False,
    )

    topic_model.fit_transform(docs)

    # Summarise every non-outlier topic by its top-10 keywords.
    info = topic_model.get_topic_info()
    info = info[info["Topic"] != -1].reset_index(drop=True)

    records = []
    for _, entry in info.iterrows():
        topic_id = int(entry["Topic"])
        top_words = topic_model.get_topic(topic_id)
        records.append({
            "topic_id": topic_id,
            "keywords": ", ".join(word for word, _ in top_words[:10]),
            "source": source_label,
        })

    return pd.DataFrame(records), topic_model
297
+
298
+
299
def _run_lda(docs: List[str], source_label: str, n_topics: int):
    """Fit sklearn LDA with exactly ``n_topics`` components and return a
    DataFrame of topic_id / top-10 keywords / source plus the model."""
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer

    # Uni- and bi-gram bag-of-words representation, capped at 10k terms.
    vec = CountVectorizer(
        stop_words="english",
        max_df=0.90,
        min_df=2,
        ngram_range=(1, 2),
        max_features=10000,
    )
    matrix = vec.fit_transform(docs)
    vocab = vec.get_feature_names_out()

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=25,
        learning_method="online",
        random_state=42,
        n_jobs=-1,
    )
    lda.fit(matrix)

    # For each component, rank terms by weight and keep the top 10.
    records = []
    for topic_idx, weights in enumerate(lda.components_):
        ranked = weights.argsort()[-10:][::-1]
        records.append({
            "topic_id": topic_idx,
            "keywords": ", ".join(vocab[i] for i in ranked),
            "source": source_label,
        })

    return pd.DataFrame(records), lda
331
+
332
+
333
+ # ===================================================================
334
+ # 4. generate_labels — Create human-readable labels for each topic
335
+ # ===================================================================
336
def generate_labels(
    topics_df: pd.DataFrame,
    use_llm: bool = False,
    groq_api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Attach a short human-readable 'label' column to every topic.

    Dispatch: when ``use_llm`` is True AND an API key is supplied, labels
    come from the Groq LLM (llama-3.3-70b-versatile); otherwise a
    keyword-based heuristic (title-cased top keywords) is used.

    Parameters
    ----------
    topics_df : pd.DataFrame
        Must contain columns 'topic_id' and 'keywords'.
    use_llm : bool
        Whether to use the Groq LLM for label generation.
    groq_api_key : str, optional
        Groq API key; required for the LLM path.

    Returns
    -------
    pd.DataFrame
        Copy of the input with an additional 'label' column.
    """
    if use_llm and groq_api_key:
        logger.info("Generating labels using Groq LLM …")
        return _generate_labels_llm(topics_df, groq_api_key)

    logger.info("Generating labels using keyword heuristic …")
    return _generate_labels_heuristic(topics_df)
371
+
372
+
373
+ def _generate_labels_heuristic(df: pd.DataFrame) -> pd.DataFrame:
374
+ """Create labels from the top keywords of each topic."""
375
+ labels = []
376
+ for _, row in df.iterrows():
377
+ kws = [kw.strip() for kw in row["keywords"].split(",")]
378
+ # Take the first 3-4 non-trivial keywords and title-case them
379
+ candidates = [kw.title() for kw in kws if len(kw) > 2][:4]
380
+ label = " / ".join(candidates) if candidates else f"Topic {row['topic_id']}"
381
+ labels.append(label)
382
+ df = df.copy()
383
+ df["label"] = labels
384
+ return df
385
+
386
+
387
def _generate_labels_llm(df: pd.DataFrame, api_key: str) -> pd.DataFrame:
    """
    Generate contextual topic labels via the Groq LLM API (batched).

    Topics are sent in batches of 10; the model is asked to return a JSON
    list of {"topic_id", "label"} objects. On any failure (missing groq
    package, API error, unparsable response) the affected topics fall
    back to a keyword-derived label, so API issues never raise.

    Fix: topic ids are normalised to ``str`` on both sides of the lookup.
    The LLM may return "topic_id" as a JSON string (e.g. "5") while the
    DataFrame holds ints; the previous exact-key lookup then silently
    missed every entry and produced generic "Topic N" labels.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'topic_id' and 'keywords' columns.
    api_key : str
        Groq API key.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with an added 'label' column.
    """
    import time
    try:
        from groq import Groq
    except ImportError:
        logger.warning("groq package not installed. Falling back to heuristic labels.")
        return _generate_labels_heuristic(df)

    client = Groq(api_key=api_key)
    labels: List[str] = []

    def _fallback_label(row) -> str:
        # Keyword-derived label used when the LLM response is unusable.
        kws = [kw.strip().title() for kw in row["keywords"].split(",")][:4]
        return " / ".join(kws)

    # Process in small batches to stay within free-tier rate limits.
    batch_size = 10
    for batch_start in range(0, len(df), batch_size):
        batch = df.iloc[batch_start:batch_start + batch_size]
        prompt_lines = [
            f"Topic {row['topic_id']}: keywords = [{row['keywords']}]"
            for _, row in batch.iterrows()
        ]
        prompt = (
            "You are a research taxonomy expert. For each topic below, "
            "generate a concise, descriptive label (3-6 words) that captures "
            "the theme of the keywords. Return ONLY a JSON list of objects "
            'with keys "topic_id" and "label". No extra text.\n\n'
            + "\n".join(prompt_lines)
        )

        try:
            chat = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=1024,
            )
            resp = chat.choices[0].message.content.strip()
            # Extract the first JSON array from the (possibly chatty) reply.
            json_match = re.search(r"\[.*\]", resp, re.DOTALL)
            if json_match:
                batch_labels = json.loads(json_match.group())
                # Normalise ids to str so "5" (LLM JSON) matches 5 (DataFrame).
                label_map = {str(item["topic_id"]): item["label"] for item in batch_labels}
                for _, row in batch.iterrows():
                    labels.append(label_map.get(str(row["topic_id"]), f"Topic {row['topic_id']}"))
            else:
                # No parseable JSON in this reply: heuristic labels for the batch.
                for _, row in batch.iterrows():
                    labels.append(_fallback_label(row))
        except Exception as exc:
            logger.warning("Groq API error for batch starting at %d: %s", batch_start, exc)
            for _, row in batch.iterrows():
                labels.append(_fallback_label(row))

        # Rate-limit courtesy delay between batches.
        time.sleep(0.5)

    out = df.copy()
    out["label"] = labels
    return out
448
+
449
+
450
+ # ===================================================================
451
+ # 5. compare_themes — Cross-compare title vs abstract topics
452
+ # ===================================================================
453
def compare_themes(
    title_topics: pd.DataFrame,
    abstract_topics: pd.DataFrame,
) -> pd.DataFrame:
    """
    Cross-compare title-derived and abstract-derived topics.

    For every title topic, the best-matching abstract topic is located
    by Jaccard similarity over the comma-separated keyword sets, and the
    pair is reported together with a qualitative alignment rating
    (Strong ≥ 0.4, Moderate ≥ 0.2, Weak > 0, otherwise No Match).

    Parameters
    ----------
    title_topics : pd.DataFrame
        Topics from titles ('topic_id', 'keywords', optional 'label').
    abstract_topics : pd.DataFrame
        Topics from abstracts (same columns).

    Returns
    -------
    pd.DataFrame
        One row per title topic with columns: title_topic_id,
        title_label, title_keywords, abstract_topic_id, abstract_label,
        abstract_keywords, similarity, alignment.
    """
    logger.info("Comparing themes: %d title topics × %d abstract topics",
                len(title_topics), len(abstract_topics))

    def _tokens(raw: str) -> set:
        # Normalised keyword set: stripped, lower-cased, empties removed.
        return {piece.strip().lower() for piece in raw.split(",") if piece.strip()}

    def _rate(score: float) -> str:
        if score >= 0.4:
            return "Strong"
        if score >= 0.2:
            return "Moderate"
        if score > 0:
            return "Weak"
        return "No Match"

    records = []
    for _, title_row in title_topics.iterrows():
        title_kws = _tokens(title_row["keywords"])
        top_score = 0.0
        top_row = None

        # Scan every abstract topic for the highest Jaccard similarity.
        for _, abs_row in abstract_topics.iterrows():
            abs_kws = _tokens(abs_row["keywords"])
            if not title_kws or not abs_kws:
                continue
            union = len(title_kws | abs_kws)
            score = len(title_kws & abs_kws) / union if union else 0.0
            if score > top_score:
                top_score = score
                top_row = abs_row

        matched = top_row is not None
        records.append({
            "title_topic_id": title_row["topic_id"],
            "title_label": title_row.get("label", ""),
            "title_keywords": title_row["keywords"],
            "abstract_topic_id": top_row["topic_id"] if matched else None,
            "abstract_label": top_row.get("label", "") if matched else "",
            "abstract_keywords": top_row["keywords"] if matched else "",
            "similarity": round(top_score, 4),
            "alignment": _rate(top_score),
        })

    result = pd.DataFrame(records)
    logger.info("Theme comparison complete: %d rows", len(result))
    return result
527
+
528
+
529
+ # ===================================================================
530
+ # 6. create_taxonomy_map — Classify themes as MAPPED or NOVEL
531
+ # ===================================================================
532
def create_taxonomy_map(
    topics_df: pd.DataFrame,
    known_themes: Optional[List[str]] = None,
    threshold: float = 0.15,
) -> Dict[str, Any]:
    """
    Split topics into MAPPED (resembling a known research theme) and
    NOVEL (no sufficiently similar known theme).

    Each topic's keywords — both the whole comma-separated phrases and
    their individual words — are compared against every known theme with
    a Jaccard-style overlap ratio; the best score decides the class.

    Parameters
    ----------
    topics_df : pd.DataFrame
        Must contain 'topic_id', 'keywords', and 'label' columns.
    known_themes : list of str, optional
        Reference themes (defaults to the built-in KNOWN_THEMES).
    threshold : float
        Minimum best-overlap ratio required to classify as MAPPED.

    Returns
    -------
    dict
        JSON-serialisable taxonomy map:
        {
            "metadata": {total_topics, mapped_count, novel_count, threshold},
            "mapped": [ {topic_id, label, keywords, matched_theme, score, classification}, ... ],
            "novel": [ {topic_id, label, keywords, score, classification}, ... ],
        }
    """
    themes = KNOWN_THEMES if known_themes is None else known_themes

    logger.info("Building taxonomy map for %d topics (threshold=%.2f)", len(topics_df), threshold)

    mapped: List[Dict] = []
    novel: List[Dict] = []

    # Pre-tokenise the reference themes once.
    theme_token_sets = [set(name.lower().split()) for name in themes]

    for _, row in topics_df.iterrows():
        phrases = {
            piece.strip().lower()
            for piece in row["keywords"].split(",")
            if piece.strip()
        }
        # Match on whole phrases AND their constituent words.
        tokens = set(phrases)
        for phrase in phrases:
            tokens.update(phrase.split())

        best_score = 0.0
        best_theme = ""
        for theme_name, theme_tokens in zip(themes, theme_token_sets):
            if not tokens or not theme_tokens:
                continue
            union_size = len(tokens | theme_tokens)
            score = len(tokens & theme_tokens) / union_size if union_size else 0.0
            if score > best_score:
                best_score = score
                best_theme = theme_name

        record = {
            "topic_id": int(row["topic_id"]),
            "label": row.get("label", ""),
            "keywords": row["keywords"],
            "score": round(best_score, 4),
        }

        if best_score >= threshold:
            record["matched_theme"] = best_theme
            record["classification"] = "MAPPED"
            mapped.append(record)
        else:
            record["classification"] = "NOVEL"
            novel.append(record)

    taxonomy = {
        "metadata": {
            "total_topics": len(topics_df),
            "mapped_count": len(mapped),
            "novel_count": len(novel),
            "threshold": threshold,
        },
        "mapped": mapped,
        "novel": novel,
    }

    logger.info("Taxonomy: %d MAPPED, %d NOVEL", len(mapped), len(novel))
    return taxonomy