Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files
- .gitattributes +1 -0
- Dockerfile +41 -0
- README.md +10 -0
- agent.py +287 -0
- app.py +1191 -0
- dataset.csv +3 -0
- requirements.txt +16 -0
- tools.py +626 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
dataset.csv filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies needed by some Python packages.
# curl is added because the HEALTHCHECK below shells out to it —
# python:3.11-slim does not include curl, so without this layer the
# health check would always fail and the container would be marked unhealthy.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (Docker layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data at build time so it's baked into the image
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords')"

# Copy application code
COPY app.py .
COPY agent.py .
COPY tools.py .

# Copy dataset (bundled as the default dataset)
COPY dataset.csv .

# Create writable outputs directory
RUN mkdir -p /app/outputs && chmod 777 /app/outputs

# Expose Streamlit port
EXPOSE 8501

# Health check (requires curl — installed above)
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1

# Run Streamlit
CMD ["streamlit", "run", "app.py", \
     "--server.port=8501", \
     "--server.address=0.0.0.0", \
     "--server.headless=true", \
     "--browser.gatherUsageStats=false", \
     "--server.fileWatcherType=none"]
README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Research Topic Modeler
|
| 3 |
+
emoji: 🔬
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8501
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
agent.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
agent.py — TopicAgent orchestrates the end-to-end topic modeling workflow.
|
| 3 |
+
|
| 4 |
+
This module defines the TopicAgent class, which:
|
| 5 |
+
1. Loads and validates the CSV dataset.
|
| 6 |
+
2. Preprocesses text for Titles and Abstracts separately.
|
| 7 |
+
3. Runs topic modeling on each corpus (≥100 topics guaranteed).
|
| 8 |
+
4. Generates human-readable labels for every topic.
|
| 9 |
+
5. Compares dominant themes across Title and Abstract topics.
|
| 10 |
+
6. Produces a taxonomy map (MAPPED / NOVEL classification).
|
| 11 |
+
7. Exports structured outputs: topics table, comparison CSV, taxonomy JSON.
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
agent = TopicAgent(csv_path="dataset.csv")
|
| 15 |
+
results = agent.run()
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import os
|
| 19 |
+
import json
|
| 20 |
+
import logging
|
| 21 |
+
from dataclasses import dataclass, field
|
| 22 |
+
from typing import Dict, Any, Optional
|
| 23 |
+
|
| 24 |
+
import pandas as pd
|
| 25 |
+
|
| 26 |
+
from tools import (
|
| 27 |
+
load_csv,
|
| 28 |
+
preprocess_text,
|
| 29 |
+
run_topic_modeling,
|
| 30 |
+
generate_labels,
|
| 31 |
+
compare_themes,
|
| 32 |
+
create_taxonomy_map,
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
# Logging
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
logger = logging.getLogger(__name__)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# Structured result container
|
| 43 |
+
# ---------------------------------------------------------------------------
|
@dataclass
class AgentResult:
    """Container for all outputs produced by the TopicAgent.

    Fields are populated incrementally as the pipeline runs; any step
    that did not complete leaves its field at the default (empty) value,
    so consumers should check ``status`` before relying on the data.
    """
    # Core dataframes (empty until the corresponding pipeline step runs)
    title_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    abstract_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    combined_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    comparison: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Taxonomy map (dict serialisable to JSON)
    taxonomy_map: Dict[str, Any] = field(default_factory=dict)

    # Execution metadata
    status: str = "pending"  # "pending" -> "success" | "failed"
    steps_completed: list = field(default_factory=list)  # step names, in execution order
    errors: list = field(default_factory=list)  # stringified exceptions from failed runs

    # File paths of exported artefacts, keyed by artefact name
    # (e.g. "topics_table" -> "outputs/topics_table.csv")
    exported_files: Dict[str, str] = field(default_factory=dict)
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# TopicAgent
|
| 67 |
+
# ---------------------------------------------------------------------------
|
class TopicAgent:
    """
    Orchestrates the research-paper topic modeling pipeline.

    The pipeline is a fixed, ordered sequence of nine steps (load,
    preprocess, model titles, model abstracts, label, combine, compare,
    taxonomy, export). Each step records its name in
    ``AgentResult.steps_completed``, so a failed run shows exactly how
    far it got.

    Parameters
    ----------
    csv_path : str
        Path to the input CSV file. Must contain "Title" and "Abstract"
        columns (read directly in ``_step_preprocess``).
    output_dir : str
        Directory to write output files. Created if missing.
    min_topics : int
        Minimum number of topics to generate per source (default 100).
    use_llm_labels : bool
        Whether to use Groq LLM for label generation.
    groq_api_key : str, optional
        API key for Groq (used only when use_llm_labels is True).
    """

    def __init__(
        self,
        csv_path: str,
        output_dir: str = "outputs",
        min_topics: int = 100,
        use_llm_labels: bool = False,
        groq_api_key: Optional[str] = None,
    ) -> None:
        self.csv_path = csv_path
        self.output_dir = output_dir
        self.min_topics = min_topics
        self.use_llm_labels = use_llm_labels
        self.groq_api_key = groq_api_key

        # Ensure output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

        self._result = AgentResult()

    # -----------------------------------------------------------------
    # Public interface
    # -----------------------------------------------------------------
    def run(self) -> AgentResult:
        """
        Execute the full pipeline step by step.

        Never raises: any exception from a step is caught, recorded in
        ``result.errors``, and reflected as ``status == "failed"``.

        Returns
        -------
        AgentResult
            Structured results including all DataFrames, taxonomy, and file paths.
        """
        logger.info("=" * 60)
        logger.info("TopicAgent — Starting pipeline")
        logger.info("=" * 60)

        try:
            # Step 1: Load CSV
            self._step_load_csv()

            # Step 2: Preprocess text
            self._step_preprocess()

            # Step 3: Topic modeling on Titles
            self._step_model_titles()

            # Step 4: Topic modeling on Abstracts
            self._step_model_abstracts()

            # Step 5: Generate labels
            self._step_generate_labels()

            # Step 6: Build combined topics table
            self._step_combine_topics()

            # Step 7: Compare themes
            self._step_compare_themes()

            # Step 8: Create taxonomy map
            self._step_taxonomy_map()

            # Step 9: Export outputs
            self._step_export()

            self._result.status = "success"
            logger.info("Pipeline completed successfully.")

        except Exception as exc:
            # Top-level boundary: record and report instead of propagating,
            # so callers always get an AgentResult back.
            self._result.status = "failed"
            self._result.errors.append(str(exc))
            logger.error("Pipeline failed: %s", exc, exc_info=True)

        return self._result

    # -----------------------------------------------------------------
    # Pipeline steps
    # -----------------------------------------------------------------
    def _step_load_csv(self) -> None:
        """Step 1 — Ingest CSV dataset via tools.load_csv."""
        logger.info("Step 1/9: Loading CSV …")
        self._df = load_csv(self.csv_path)
        self._result.steps_completed.append("load_csv")
        logger.info(" → %d papers loaded.", len(self._df))

    def _step_preprocess(self) -> None:
        """Step 2 — Preprocess Title and Abstract text (two separate corpora)."""
        logger.info("Step 2/9: Preprocessing text …")
        # Requires "Title" and "Abstract" columns in the loaded dataframe.
        self._titles_clean = preprocess_text(self._df["Title"].tolist())
        self._abstracts_clean = preprocess_text(self._df["Abstract"].tolist())
        self._result.steps_completed.append("preprocess_text")
        logger.info(" → Titles preprocessed: %d docs", len(self._titles_clean))
        logger.info(" → Abstracts preprocessed: %d docs", len(self._abstracts_clean))

    def _step_model_titles(self) -> None:
        """Step 3 — Topic modeling on Titles."""
        logger.info("Step 3/9: Topic modeling on Titles …")
        self._title_topics_df, self._title_model = run_topic_modeling(
            self._titles_clean,
            source_label="Titles",
            min_topics=self.min_topics,
        )
        self._result.steps_completed.append("topic_modeling_titles")
        logger.info(" → %d title topics discovered.", len(self._title_topics_df))

    def _step_model_abstracts(self) -> None:
        """Step 4 — Topic modeling on Abstracts."""
        logger.info("Step 4/9: Topic modeling on Abstracts …")
        self._abstract_topics_df, self._abstract_model = run_topic_modeling(
            self._abstracts_clean,
            source_label="Abstracts",
            min_topics=self.min_topics,
        )
        self._result.steps_completed.append("topic_modeling_abstracts")
        logger.info(" → %d abstract topics discovered.", len(self._abstract_topics_df))

    def _step_generate_labels(self) -> None:
        """Step 5 — Generate human-readable labels for both topic tables."""
        logger.info("Step 5/9: Generating topic labels …")
        self._title_topics_df = generate_labels(
            self._title_topics_df,
            use_llm=self.use_llm_labels,
            groq_api_key=self.groq_api_key,
        )
        self._abstract_topics_df = generate_labels(
            self._abstract_topics_df,
            use_llm=self.use_llm_labels,
            groq_api_key=self.groq_api_key,
        )
        # Labeled tables become part of the public result.
        self._result.title_topics = self._title_topics_df
        self._result.abstract_topics = self._abstract_topics_df
        self._result.steps_completed.append("generate_labels")
        logger.info(" → Labels generated for all topics.")

    def _step_combine_topics(self) -> None:
        """Step 6 — Combine title and abstract topics into one table."""
        logger.info("Step 6/9: Building combined topics table …")
        combined = pd.concat(
            [self._title_topics_df, self._abstract_topics_df],
            ignore_index=True,
        )
        # Sequential id across both sources, unique within the combined table.
        combined["global_id"] = range(len(combined))
        self._result.combined_topics = combined
        self._result.steps_completed.append("combine_topics")
        logger.info(" → Combined table: %d topics total.", len(combined))

    def _step_compare_themes(self) -> None:
        """Step 7 — Compare title vs abstract themes."""
        logger.info("Step 7/9: Comparing title vs abstract themes …")
        comparison = compare_themes(self._title_topics_df, self._abstract_topics_df)
        self._result.comparison = comparison
        self._result.steps_completed.append("compare_themes")
        logger.info(" → Comparison table: %d rows.", len(comparison))

    def _step_taxonomy_map(self) -> None:
        """Step 8 — Create taxonomy map (MAPPED / NOVEL)."""
        logger.info("Step 8/9: Building taxonomy map …")
        # Use the combined topics for taxonomy
        taxonomy = create_taxonomy_map(self._result.combined_topics)
        self._result.taxonomy_map = taxonomy
        self._result.steps_completed.append("create_taxonomy_map")
        # NOTE(review): assumes create_taxonomy_map returns a dict with
        # metadata.mapped_count / metadata.novel_count — confirm in tools.py.
        logger.info(
            " → MAPPED: %d, NOVEL: %d",
            taxonomy["metadata"]["mapped_count"],
            taxonomy["metadata"]["novel_count"],
        )

    def _step_export(self) -> None:
        """Step 9 — Export all outputs to disk (4 CSVs + 1 JSON)."""
        logger.info("Step 9/9: Exporting outputs …")

        # (a) Combined topics table CSV
        topics_path = os.path.join(self.output_dir, "topics_table.csv")
        self._result.combined_topics.to_csv(topics_path, index=False)
        self._result.exported_files["topics_table"] = topics_path
        logger.info(" → Saved: %s", topics_path)

        # (b) Comparison CSV
        comparison_path = os.path.join(self.output_dir, "comparison.csv")
        self._result.comparison.to_csv(comparison_path, index=False)
        self._result.exported_files["comparison"] = comparison_path
        logger.info(" → Saved: %s", comparison_path)

        # (c) Taxonomy map JSON
        taxonomy_path = os.path.join(self.output_dir, "taxonomy_map.json")
        with open(taxonomy_path, "w", encoding="utf-8") as f:
            json.dump(self._result.taxonomy_map, f, indent=2, ensure_ascii=False)
        self._result.exported_files["taxonomy_map"] = taxonomy_path
        logger.info(" → Saved: %s", taxonomy_path)

        # (d) Title topics CSV
        title_path = os.path.join(self.output_dir, "title_topics.csv")
        self._result.title_topics.to_csv(title_path, index=False)
        self._result.exported_files["title_topics"] = title_path
        logger.info(" → Saved: %s", title_path)

        # (e) Abstract topics CSV
        abstract_path = os.path.join(self.output_dir, "abstract_topics.csv")
        self._result.abstract_topics.to_csv(abstract_path, index=False)
        self._result.exported_files["abstract_topics"] = abstract_path
        logger.info(" → Saved: %s", abstract_path)

        self._result.steps_completed.append("export")
        logger.info(" → All outputs exported successfully.")
app.py
ADDED
|
@@ -0,0 +1,1191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py — Streamlit frontend for the AI-driven Topic Modeling application.
|
| 3 |
+
|
| 4 |
+
This module provides an interactive web interface that allows users to:
|
| 5 |
+
1. Upload a CSV file containing research paper Titles and Abstracts.
|
| 6 |
+
2. Configure pipeline parameters (min topics, LLM label generation).
|
| 7 |
+
3. Run the TopicAgent pipeline with a single click.
|
| 8 |
+
4. View and explore results: topics table, comparison, taxonomy map.
|
| 9 |
+
5. Review topics with an editable review table.
|
| 10 |
+
6. Visualize topic distributions with interactive Plotly charts.
|
| 11 |
+
7. Download all generated outputs (CSV, JSON).
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import json
|
| 16 |
+
import tempfile
|
| 17 |
+
|
| 18 |
+
import streamlit as st
|
| 19 |
+
import pandas as pd
|
| 20 |
+
import plotly.express as px
|
| 21 |
+
import plotly.graph_objects as go
|
| 22 |
+
|
| 23 |
+
from agent import TopicAgent
|
| 24 |
+
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
# HuggingFace Spaces compatibility: use a writable output directory
|
| 27 |
+
# On HF Spaces the working directory can be read-only, so fall back to /tmp
|
| 28 |
+
# ---------------------------------------------------------------------------
|
# Prefer a local "outputs" directory; on read-only working dirs (e.g. some
# HuggingFace Spaces setups) fall back to a directory under the system tmp dir.
OUTPUT_DIR = "outputs"
try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Probe actual write access: makedirs succeeds on an existing but
    # read-only directory, so write (and remove) a throwaway file.
    _test_path = os.path.join(OUTPUT_DIR, ".write_test")
    with open(_test_path, "w") as _f:
        _f.write("ok")
    os.remove(_test_path)
except OSError:
    # PermissionError is a subclass of OSError, so catching OSError alone
    # covers the previous (OSError, PermissionError) tuple exactly.
    OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "topic_modeler_outputs")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# Page configuration
|
| 43 |
+
# ---------------------------------------------------------------------------
|
# NOTE(review): Streamlit requires set_page_config to be the first st.* call
# in the script — keep it immediately after the imports.
st.set_page_config(
    page_title="Research Topic Modeler — AI Agent",
    page_icon="🔬",
    layout="wide",  # full browser width for wide tables/charts
    initial_sidebar_state="expanded",
)
| 50 |
+
|
| 51 |
+
# ---------------------------------------------------------------------------
# Custom CSS for a polished, professional look with dark-safe text colors
# ---------------------------------------------------------------------------
st.markdown("""
<style>
/* Import Google Font */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

/* Global */
html, body, [class*="css"] {
    font-family: 'Inter', sans-serif;
}

/* Header gradient banner */
.main-header {
    background: linear-gradient(135deg, #0f0c29 0%, #302b63 50%, #24243e 100%);
    padding: 2rem 2.5rem;
    border-radius: 16px;
    margin-bottom: 1.5rem;
    box-shadow: 0 8px 32px rgba(48, 43, 99, 0.3);
}
.main-header h1 {
    color: #ffffff;
    font-size: 2.2rem;
    font-weight: 700;
    margin: 0;
    letter-spacing: -0.5px;
}
.main-header p {
    color: #b8b5ff;
    font-size: 1.05rem;
    margin: 0.5rem 0 0 0;
    font-weight: 300;
}

/* Stat cards */
.stat-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 1.25rem 1.5rem;
    border-radius: 12px;
    color: white;
    text-align: center;
    box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
    transition: transform 0.2s ease;
}
.stat-card:hover {
    transform: translateY(-2px);
}
.stat-card .stat-value {
    font-size: 2rem;
    font-weight: 700;
    line-height: 1.2;
    color: #ffffff;
}
.stat-card .stat-label {
    font-size: 0.85rem;
    opacity: 0.85;
    margin-top: 0.3rem;
    font-weight: 400;
    color: #e8e6ff;
}

/* Status badge */
.status-badge {
    display: inline-block;
    padding: 0.3rem 1rem;
    border-radius: 20px;
    font-size: 0.8rem;
    font-weight: 600;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}
.status-success {
    background: linear-gradient(135deg, #11998e, #38ef7d);
    color: #ffffff;
}
.status-failed {
    background: linear-gradient(135deg, #eb3349, #f45c43);
    color: #ffffff;
}
.status-running {
    background: linear-gradient(135deg, #f7971e, #ffd200);
    color: #1a1a2e;
}

/* Section headers — always readable on both light and dark backgrounds */
.section-header {
    font-size: 1.3rem;
    font-weight: 600;
    color: #c4b5fd;
    margin: 1.5rem 0 0.75rem 0;
    padding-bottom: 0.5rem;
    border-bottom: 2px solid #667eea;
    display: inline-block;
}

/* Taxonomy badges */
.mapped-badge {
    display: inline-block;
    background: linear-gradient(135deg, #11998e, #38ef7d);
    color: #ffffff;
    padding: 0.2rem 0.7rem;
    border-radius: 12px;
    font-size: 0.75rem;
    font-weight: 600;
}
.novel-badge {
    display: inline-block;
    background: linear-gradient(135deg, #fc4a1a, #f7b733);
    color: #ffffff;
    padding: 0.2rem 0.7rem;
    border-radius: 12px;
    font-size: 0.75rem;
    font-weight: 600;
}

/* Sidebar styling */
section[data-testid="stSidebar"] {
    background: linear-gradient(180deg, #1a1a2e 0%, #16213e 100%);
}
section[data-testid="stSidebar"] .stMarkdown {
    color: #e0e0e0;
}
section[data-testid="stSidebar"] label {
    color: #e0e0e0 !important;
}
section[data-testid="stSidebar"] .stSlider label {
    color: #e0e0e0 !important;
}

/* Data table enhancements */
.stDataFrame {
    border-radius: 8px;
    overflow: hidden;
}

/* Info box — dark-safe: dark background with light text */
.info-box {
    background: linear-gradient(135deg, #1e1e3f 0%, #2d2b55 100%);
    padding: 1rem 1.5rem;
    border-radius: 10px;
    border-left: 4px solid #667eea;
    margin: 0.75rem 0;
    color: #e0e0e0;
}
.info-box strong {
    color: #ffffff;
}
.info-box code {
    background: rgba(102, 126, 234, 0.2);
    color: #b8b5ff;
    padding: 0.1rem 0.4rem;
    border-radius: 4px;
}

/* Pipeline step */
.step-item {
    padding: 0.5rem 1rem;
    margin: 0.3rem 0;
    border-radius: 8px;
    background: rgba(102, 126, 234, 0.15);
    border-left: 3px solid #667eea;
    font-size: 0.9rem;
    color: #e0e0e0;
}

/* Chart container styling */
.chart-container {
    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
    border-radius: 12px;
    padding: 1rem;
    margin: 0.5rem 0;
    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
}

/* Review section header */
.review-header {
    background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
    padding: 1rem 1.5rem;
    border-radius: 12px;
    margin-bottom: 1rem;
    box-shadow: 0 4px 15px rgba(17, 153, 142, 0.3);
}
.review-header h3 {
    color: #ffffff;
    margin: 0;
    font-weight: 600;
}
.review-header p {
    color: #e0fff8;
    margin: 0.3rem 0 0 0;
    font-size: 0.9rem;
}

/* Save confirmation */
.save-confirm {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: #ffffff;
    padding: 0.75rem 1.25rem;
    border-radius: 10px;
    margin-top: 0.5rem;
    font-weight: 500;
}

/* Ensure tab labels are readable */
.stTabs [data-baseweb="tab-list"] button {
    color: #c4b5fd;
}
.stTabs [data-baseweb="tab-list"] button[aria-selected="true"] {
    color: #ffffff;
}
</style>
""", unsafe_allow_html=True)

# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
st.markdown("""
<div class="main-header">
    <h1>🔬 Research Topic Modeler</h1>
    <p>AI-powered topic modeling agent for research papers — discover, compare, and classify themes across Titles and Abstracts</p>
</div>
""", unsafe_allow_html=True)

# ---------------------------------------------------------------------------
# Sidebar — Configuration
# ---------------------------------------------------------------------------
with st.sidebar:
    st.markdown("## ⚙️ Configuration")
    st.markdown("---")

    # File upload
    st.markdown("### 📁 Dataset")
    uploaded_file = st.file_uploader(
        "Upload CSV with Title & Abstract columns",
        type=["csv"],
        help="The CSV must contain at least 'Title' and 'Abstract' columns.",
    )

    # Or use the bundled default dataset; defaults to on when nothing uploaded.
    use_default = st.checkbox(
        "Use default dataset (dataset.csv)",
        value=uploaded_file is None,
        help="Use the bundled dataset.csv file in the project directory.",
    )

    st.markdown("---")
    st.markdown("### 🎯 Parameters")

    # Minimum topic count applied independently to Titles and Abstracts.
    min_topics = st.slider(
        "Minimum Topics",
        min_value=50,
        max_value=200,
        value=100,
        step=10,
        help="Minimum number of topics to generate per source (Titles / Abstracts).",
    )

    use_llm = st.checkbox(
        "🤖 Use LLM for Label Generation (Groq)",
        value=False,
        help="Use Groq's LLaMA model to generate contextual topic labels. "
             "Falls back to keyword heuristic if unchecked.",
    )

    # API key comes from the environment by default; the text input lets the
    # user override it when LLM labelling is enabled.
    groq_key = os.environ.get("GROQ_API_KEY", "")
    if use_llm:
        groq_key = st.text_input(
            "Groq API Key",
            value=groq_key,
            type="password",
            help="Your Groq API key for LLM label generation.",
        )

    st.markdown("---")
    st.markdown("### 📋 Pipeline Steps")
    steps_info = [
        "1. Load & validate CSV",
        "2. Preprocess text (Titles + Abstracts)",
        f"3. Topic modeling — Titles (≥{min_topics} topics)",
        f"4. Topic modeling — Abstracts (≥{min_topics} topics)",
        "5. Generate human-readable labels",
        "6. Combine topics table",
        "7. Compare themes (Title vs Abstract)",
        "8. Build taxonomy map (MAPPED / NOVEL)",
        "9. Export outputs (CSV, JSON)",
    ]
    for step in steps_info:
        st.markdown(f'<div class="step-item">{step}</div>', unsafe_allow_html=True)

# ---------------------------------------------------------------------------
# Main area — Run button and results
# ---------------------------------------------------------------------------
col_run, col_status = st.columns([2, 3])

with col_run:
    run_clicked = st.button("🚀 Run Topic Modeling Agent", use_container_width=True, type="primary")

with col_status:
    # NOTE(review): the scrape lost indentation here; "Awaiting Input" is
    # rendered when no pipeline result exists yet — confirm against original.
    if "result" in st.session_state and st.session_state.result is not None:
        res = st.session_state.result
        if res.status == "success":
            st.markdown('<span class="status-badge status-success">✓ Pipeline Complete</span>', unsafe_allow_html=True)
        elif res.status == "failed":
            st.markdown('<span class="status-badge status-failed">✗ Pipeline Failed</span>', unsafe_allow_html=True)
    else:
        st.markdown('<span class="status-badge status-running">● Awaiting Input</span>', unsafe_allow_html=True)

# ---------------------------------------------------------------------------
# Execute pipeline
# ---------------------------------------------------------------------------
if run_clicked:
    # Determine CSV path
    csv_path = None
    if uploaded_file is not None:
        # Save the uploaded file to the system temp directory. On HF Spaces
        # the working directory can be read-only (see OUTPUT_DIR fallback
        # above), so we must not create the temp file with dir=".".
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            tmp.write(uploaded_file.getvalue())
            csv_path = tmp.name
    elif use_default:
        csv_path = "dataset.csv"
        if not os.path.exists(csv_path):
            st.error("❌ Default dataset.csv not found in the project directory.")
            st.stop()
    else:
        st.error("❌ Please upload a CSV file or select the default dataset.")
        st.stop()

    # Run the agent step by step so the progress bar reflects real progress.
    with st.spinner("🔄 Running the Topic Modeling Agent … this may take a few minutes."):
        progress = st.progress(0, text="Initializing …")

        agent = TopicAgent(
            csv_path=csv_path,
            output_dir=OUTPUT_DIR,
            min_topics=min_topics,
            use_llm_labels=use_llm,
            groq_api_key=groq_key if use_llm else None,
        )

        progress.progress(5, text="Step 1/9: Loading CSV …")
        agent._step_load_csv()
        progress.progress(10, text="Step 2/9: Preprocessing text …")
        agent._step_preprocess()
        progress.progress(20, text="Step 3/9: Topic modeling on Titles …")
        agent._step_model_titles()
        progress.progress(45, text="Step 4/9: Topic modeling on Abstracts …")
        agent._step_model_abstracts()
        progress.progress(65, text="Step 5/9: Generating topic labels …")
        agent._step_generate_labels()
        progress.progress(75, text="Step 6/9: Building combined topics table …")
        agent._step_combine_topics()
        progress.progress(80, text="Step 7/9: Comparing themes …")
        agent._step_compare_themes()
        progress.progress(90, text="Step 8/9: Building taxonomy map …")
        agent._step_taxonomy_map()
        progress.progress(95, text="Step 9/9: Exporting outputs …")
        agent._step_export()

        agent._result.status = "success"
        progress.progress(100, text="✅ Pipeline complete!")

    # Persist the result so it survives Streamlit reruns.
    st.session_state.result = agent._result

    # Clean up the temp file created for an uploaded CSV (best effort).
    if uploaded_file is not None and csv_path and os.path.exists(csv_path):
        try:
            os.unlink(csv_path)
        except Exception:
            pass

    st.rerun()

# ---------------------------------------------------------------------------
# Helper: Plotly chart theme (dark background, readable text)
# Passed via fig.update_layout(**PLOTLY_LAYOUT) in the Charts tab.
# ---------------------------------------------------------------------------
PLOTLY_LAYOUT = dict(
    paper_bgcolor="rgba(26, 26, 46, 0.95)",
    plot_bgcolor="rgba(22, 33, 62, 0.95)",
    font=dict(family="Inter, sans-serif", size=13, color="#e0e0e0"),
    title_font=dict(size=18, color="#ffffff"),
    legend=dict(
        font=dict(color="#e0e0e0"),
        bgcolor="rgba(26, 26, 46, 0.7)",
        bordercolor="#667eea",
        borderwidth=1,
    ),
    xaxis=dict(
        gridcolor="rgba(102, 126, 234, 0.15)",
        zerolinecolor="rgba(102, 126, 234, 0.25)",
        tickfont=dict(color="#c4b5fd"),
        title_font=dict(color="#e0e0e0"),
    ),
    yaxis=dict(
        gridcolor="rgba(102, 126, 234, 0.15)",
        zerolinecolor="rgba(102, 126, 234, 0.25)",
        tickfont=dict(color="#c4b5fd"),
        title_font=dict(color="#e0e0e0"),
    ),
    margin=dict(l=60, r=30, t=60, b=60),
)

# Gradient-like color sequence used as the discrete palette for charts.
CHART_COLORS = [
    "#667eea", "#764ba2", "#f093fb", "#f5576c",
    "#4facfe", "#00f2fe", "#43e97b", "#38f9d7",
    "#fa709a", "#fee140", "#a18cd1", "#fbc2eb",
    "#ff9a9e", "#fad0c4", "#ffecd2", "#fcb69f",
]

# ---------------------------------------------------------------------------
|
| 469 |
+
# Display results
|
| 470 |
+
# ---------------------------------------------------------------------------
|
| 471 |
+
if "result" in st.session_state and st.session_state.result is not None:
|
| 472 |
+
result = st.session_state.result
|
| 473 |
+
|
| 474 |
+
if result.status == "failed":
|
| 475 |
+
st.error(f"Pipeline failed with errors: {result.errors}")
|
| 476 |
+
st.stop()
|
| 477 |
+
|
| 478 |
+
# ---- Summary Statistics ----
|
| 479 |
+
st.markdown('<div class="section-header">📊 Summary Statistics</div>', unsafe_allow_html=True)
|
| 480 |
+
|
| 481 |
+
c1, c2, c3, c4, c5 = st.columns(5)
|
| 482 |
+
with c1:
|
| 483 |
+
st.markdown(f"""
|
| 484 |
+
<div class="stat-card">
|
| 485 |
+
<div class="stat-value">{len(result.title_topics)}</div>
|
| 486 |
+
<div class="stat-label">Title Topics</div>
|
| 487 |
+
</div>
|
| 488 |
+
""", unsafe_allow_html=True)
|
| 489 |
+
with c2:
|
| 490 |
+
st.markdown(f"""
|
| 491 |
+
<div class="stat-card">
|
| 492 |
+
<div class="stat-value">{len(result.abstract_topics)}</div>
|
| 493 |
+
<div class="stat-label">Abstract Topics</div>
|
| 494 |
+
</div>
|
| 495 |
+
""", unsafe_allow_html=True)
|
| 496 |
+
with c3:
|
| 497 |
+
st.markdown(f"""
|
| 498 |
+
<div class="stat-card">
|
| 499 |
+
<div class="stat-value">{len(result.combined_topics)}</div>
|
| 500 |
+
<div class="stat-label">Total Topics</div>
|
| 501 |
+
</div>
|
| 502 |
+
""", unsafe_allow_html=True)
|
| 503 |
+
with c4:
|
| 504 |
+
mapped_count = result.taxonomy_map.get("metadata", {}).get("mapped_count", 0)
|
| 505 |
+
st.markdown(f"""
|
| 506 |
+
<div class="stat-card">
|
| 507 |
+
<div class="stat-value">{mapped_count}</div>
|
| 508 |
+
<div class="stat-label">Mapped Themes</div>
|
| 509 |
+
</div>
|
| 510 |
+
""", unsafe_allow_html=True)
|
| 511 |
+
with c5:
|
| 512 |
+
novel_count = result.taxonomy_map.get("metadata", {}).get("novel_count", 0)
|
| 513 |
+
st.markdown(f"""
|
| 514 |
+
<div class="stat-card">
|
| 515 |
+
<div class="stat-value">{novel_count}</div>
|
| 516 |
+
<div class="stat-label">Novel Themes</div>
|
| 517 |
+
</div>
|
| 518 |
+
""", unsafe_allow_html=True)
|
| 519 |
+
|
| 520 |
+
st.markdown("<br>", unsafe_allow_html=True)
|
| 521 |
+
|
| 522 |
+
# ---- Tabbed Results ----
|
| 523 |
+
tab1, tab2, tab3, tab4, tab5, tab_review, tab_charts = st.tabs([
|
| 524 |
+
"📋 Topics Table",
|
| 525 |
+
"🔬 Title Topics",
|
| 526 |
+
"📄 Abstract Topics",
|
| 527 |
+
"⚖️ Theme Comparison",
|
| 528 |
+
"🗺️ Taxonomy Map",
|
| 529 |
+
"✏️ Review Table",
|
| 530 |
+
"📈 Charts",
|
| 531 |
+
])
|
| 532 |
+
|
| 533 |
+
# Tab 1: Combined Topics Table
|
| 534 |
+
with tab1:
|
| 535 |
+
st.markdown('<div class="section-header">Combined Topics Table</div>', unsafe_allow_html=True)
|
| 536 |
+
st.markdown(f"Showing all **{len(result.combined_topics)}** topics from both Titles and Abstracts.")
|
| 537 |
+
|
| 538 |
+
# Filter controls
|
| 539 |
+
fcol1, fcol2 = st.columns(2)
|
| 540 |
+
with fcol1:
|
| 541 |
+
source_filter = st.multiselect(
|
| 542 |
+
"Filter by Source",
|
| 543 |
+
options=result.combined_topics["source"].unique().tolist(),
|
| 544 |
+
default=result.combined_topics["source"].unique().tolist(),
|
| 545 |
+
)
|
| 546 |
+
with fcol2:
|
| 547 |
+
search_term = st.text_input("🔍 Search keywords", "")
|
| 548 |
+
|
| 549 |
+
display_df = result.combined_topics[result.combined_topics["source"].isin(source_filter)]
|
| 550 |
+
if search_term:
|
| 551 |
+
mask = display_df["keywords"].str.contains(search_term, case=False, na=False)
|
| 552 |
+
mask |= display_df["label"].str.contains(search_term, case=False, na=False)
|
| 553 |
+
display_df = display_df[mask]
|
| 554 |
+
|
| 555 |
+
st.dataframe(
|
| 556 |
+
display_df,
|
| 557 |
+
use_container_width=True,
|
| 558 |
+
height=500,
|
| 559 |
+
column_config={
|
| 560 |
+
"topic_id": st.column_config.NumberColumn("Topic ID", width="small"),
|
| 561 |
+
"keywords": st.column_config.TextColumn("Keywords", width="large"),
|
| 562 |
+
"label": st.column_config.TextColumn("Label", width="medium"),
|
| 563 |
+
"source": st.column_config.TextColumn("Source", width="small"),
|
| 564 |
+
},
|
| 565 |
+
)
|
| 566 |
+
|
| 567 |
+
# Tab 2: Title Topics
|
| 568 |
+
with tab2:
|
| 569 |
+
st.markdown('<div class="section-header">Title Topics</div>', unsafe_allow_html=True)
|
| 570 |
+
st.markdown(f"**{len(result.title_topics)}** topics discovered from paper titles.")
|
| 571 |
+
st.dataframe(result.title_topics, use_container_width=True, height=500)
|
| 572 |
+
|
| 573 |
+
# Tab 3: Abstract Topics
|
| 574 |
+
with tab3:
|
| 575 |
+
st.markdown('<div class="section-header">Abstract Topics</div>', unsafe_allow_html=True)
|
| 576 |
+
st.markdown(f"**{len(result.abstract_topics)}** topics discovered from paper abstracts.")
|
| 577 |
+
st.dataframe(result.abstract_topics, use_container_width=True, height=500)
|
| 578 |
+
|
| 579 |
+
# Tab 4: Theme Comparison
|
| 580 |
+
with tab4:
|
| 581 |
+
st.markdown('<div class="section-header">Theme Comparison: Titles vs Abstracts</div>', unsafe_allow_html=True)
|
| 582 |
+
|
| 583 |
+
if not result.comparison.empty:
|
| 584 |
+
# Alignment distribution
|
| 585 |
+
align_counts = result.comparison["alignment"].value_counts()
|
| 586 |
+
acol1, acol2, acol3, acol4 = st.columns(4)
|
| 587 |
+
for col, alignment in zip(
|
| 588 |
+
[acol1, acol2, acol3, acol4],
|
| 589 |
+
["Strong", "Moderate", "Weak", "No Match"],
|
| 590 |
+
):
|
| 591 |
+
with col:
|
| 592 |
+
count = align_counts.get(alignment, 0)
|
| 593 |
+
st.metric(label=f"{alignment} Alignment", value=count)
|
| 594 |
+
|
| 595 |
+
st.markdown("<br>", unsafe_allow_html=True)
|
| 596 |
+
|
| 597 |
+
# Filter by alignment
|
| 598 |
+
alignment_filter = st.multiselect(
|
| 599 |
+
"Filter by Alignment",
|
| 600 |
+
options=["Strong", "Moderate", "Weak", "No Match"],
|
| 601 |
+
default=["Strong", "Moderate", "Weak", "No Match"],
|
| 602 |
+
)
|
| 603 |
+
filtered_comp = result.comparison[result.comparison["alignment"].isin(alignment_filter)]
|
| 604 |
+
|
| 605 |
+
st.dataframe(
|
| 606 |
+
filtered_comp,
|
| 607 |
+
use_container_width=True,
|
| 608 |
+
height=500,
|
| 609 |
+
column_config={
|
| 610 |
+
"similarity": st.column_config.ProgressColumn(
|
| 611 |
+
"Similarity",
|
| 612 |
+
min_value=0,
|
| 613 |
+
max_value=1,
|
| 614 |
+
format="%.2f",
|
| 615 |
+
),
|
| 616 |
+
},
|
| 617 |
+
)
|
| 618 |
+
else:
|
| 619 |
+
st.info("No comparison data available.")
|
| 620 |
+
|
| 621 |
+
# Tab 5: Taxonomy Map
|
| 622 |
+
with tab5:
|
| 623 |
+
st.markdown('<div class="section-header">Taxonomy Map</div>', unsafe_allow_html=True)
|
| 624 |
+
|
| 625 |
+
taxonomy = result.taxonomy_map
|
| 626 |
+
meta = taxonomy.get("metadata", {})
|
| 627 |
+
|
| 628 |
+
st.markdown(f"""
|
| 629 |
+
<div class="info-box">
|
| 630 |
+
<strong>Classification Summary:</strong><br>
|
| 631 |
+
Total Topics: <strong>{meta.get('total_topics', 0)}</strong> |
|
| 632 |
+
<span class="mapped-badge">MAPPED: {meta.get('mapped_count', 0)}</span> |
|
| 633 |
+
<span class="novel-badge">NOVEL: {meta.get('novel_count', 0)}</span> |
|
| 634 |
+
Threshold: {meta.get('threshold', 0.15)}
|
| 635 |
+
</div>
|
| 636 |
+
""", unsafe_allow_html=True)
|
| 637 |
+
|
| 638 |
+
tax_tab1, tax_tab2 = st.tabs(["✅ Mapped Themes", "🆕 Novel Themes"])
|
| 639 |
+
|
| 640 |
+
with tax_tab1:
|
| 641 |
+
mapped_list = taxonomy.get("mapped", [])
|
| 642 |
+
if mapped_list:
|
| 643 |
+
mapped_df = pd.DataFrame(mapped_list)
|
| 644 |
+
st.dataframe(
|
| 645 |
+
mapped_df,
|
| 646 |
+
use_container_width=True,
|
| 647 |
+
height=400,
|
| 648 |
+
column_config={
|
| 649 |
+
"score": st.column_config.ProgressColumn(
|
| 650 |
+
"Match Score",
|
| 651 |
+
min_value=0,
|
| 652 |
+
max_value=1,
|
| 653 |
+
format="%.3f",
|
| 654 |
+
),
|
| 655 |
+
},
|
| 656 |
+
)
|
| 657 |
+
else:
|
| 658 |
+
st.info("No mapped themes found.")
|
| 659 |
+
|
| 660 |
+
with tax_tab2:
|
| 661 |
+
novel_list = taxonomy.get("novel", [])
|
| 662 |
+
if novel_list:
|
| 663 |
+
novel_df = pd.DataFrame(novel_list)
|
| 664 |
+
st.dataframe(
|
| 665 |
+
novel_df,
|
| 666 |
+
use_container_width=True,
|
| 667 |
+
height=400,
|
| 668 |
+
column_config={
|
| 669 |
+
"score": st.column_config.ProgressColumn(
|
| 670 |
+
"Match Score",
|
| 671 |
+
min_value=0,
|
| 672 |
+
max_value=1,
|
| 673 |
+
format="%.3f",
|
| 674 |
+
),
|
| 675 |
+
},
|
| 676 |
+
)
|
| 677 |
+
else:
|
| 678 |
+
st.info("No novel themes found.")
|
| 679 |
+
|
| 680 |
+
# ==================================================================
|
| 681 |
+
# Tab 6: Editable Review Table
|
| 682 |
+
# ==================================================================
|
| 683 |
+
with tab_review:
|
| 684 |
+
st.markdown("""
|
| 685 |
+
<div class="review-header">
|
| 686 |
+
<h3>✏️ Topic Review Table</h3>
|
| 687 |
+
<p>Review, approve, rename, and annotate each topic. Changes are saved to outputs/review_table.csv.</p>
|
| 688 |
+
</div>
|
| 689 |
+
""", unsafe_allow_html=True)
|
| 690 |
+
|
| 691 |
+
# Build review dataframe from combined topics
|
| 692 |
+
# Load existing review table if available to preserve edits
|
| 693 |
+
review_csv_path = os.path.join(OUTPUT_DIR, "review_table.csv")
|
| 694 |
+
|
| 695 |
+
if "review_df" not in st.session_state:
|
| 696 |
+
if os.path.exists(review_csv_path):
|
| 697 |
+
# Load previously saved review table
|
| 698 |
+
existing_review = pd.read_csv(review_csv_path)
|
| 699 |
+
# Merge with current topics to ensure all topics are represented
|
| 700 |
+
current_ids = set(result.combined_topics["topic_id"].tolist())
|
| 701 |
+
existing_ids = set(existing_review["topic_id"].tolist()) if "topic_id" in existing_review.columns else set()
|
| 702 |
+
|
| 703 |
+
if current_ids == existing_ids or existing_ids.issuperset(current_ids):
|
| 704 |
+
st.session_state.review_df = existing_review
|
| 705 |
+
else:
|
| 706 |
+
# Rebuild from current topics, but preserve existing edits
|
| 707 |
+
review_data = []
|
| 708 |
+
for _, row in result.combined_topics.iterrows():
|
| 709 |
+
review_data.append({
|
| 710 |
+
"topic_id": int(row["topic_id"]),
|
| 711 |
+
"label": row.get("label", ""),
|
| 712 |
+
"keywords": row.get("keywords", ""),
|
| 713 |
+
"source": row.get("source", ""),
|
| 714 |
+
"approve": False,
|
| 715 |
+
"rename_to": "",
|
| 716 |
+
"reasoning": "",
|
| 717 |
+
})
|
| 718 |
+
new_review_df = pd.DataFrame(review_data)
|
| 719 |
+
# Merge existing edits
|
| 720 |
+
if not existing_review.empty and "topic_id" in existing_review.columns:
|
| 721 |
+
for _, erow in existing_review.iterrows():
|
| 722 |
+
mask = new_review_df["topic_id"] == erow["topic_id"]
|
| 723 |
+
if mask.any():
|
| 724 |
+
if "approve" in erow:
|
| 725 |
+
new_review_df.loc[mask, "approve"] = erow["approve"]
|
| 726 |
+
if "rename_to" in erow and pd.notna(erow["rename_to"]):
|
| 727 |
+
new_review_df.loc[mask, "rename_to"] = erow["rename_to"]
|
| 728 |
+
if "reasoning" in erow and pd.notna(erow["reasoning"]):
|
| 729 |
+
new_review_df.loc[mask, "reasoning"] = erow["reasoning"]
|
| 730 |
+
st.session_state.review_df = new_review_df
|
| 731 |
+
else:
|
| 732 |
+
# Build fresh review table
|
| 733 |
+
review_data = []
|
| 734 |
+
for _, row in result.combined_topics.iterrows():
|
| 735 |
+
review_data.append({
|
| 736 |
+
"topic_id": int(row["topic_id"]),
|
| 737 |
+
"label": row.get("label", ""),
|
| 738 |
+
"keywords": row.get("keywords", ""),
|
| 739 |
+
"source": row.get("source", ""),
|
| 740 |
+
"approve": False,
|
| 741 |
+
"rename_to": "",
|
| 742 |
+
"reasoning": "",
|
| 743 |
+
})
|
| 744 |
+
st.session_state.review_df = pd.DataFrame(review_data)
|
| 745 |
+
|
| 746 |
+
# Filter controls for review table
|
| 747 |
+
rv_col1, rv_col2, rv_col3 = st.columns(3)
|
| 748 |
+
with rv_col1:
|
| 749 |
+
review_source_filter = st.multiselect(
|
| 750 |
+
"Filter by Source",
|
| 751 |
+
options=st.session_state.review_df["source"].unique().tolist(),
|
| 752 |
+
default=st.session_state.review_df["source"].unique().tolist(),
|
| 753 |
+
key="review_source_filter",
|
| 754 |
+
)
|
| 755 |
+
with rv_col2:
|
| 756 |
+
review_search = st.text_input("🔍 Search in review table", "", key="review_search")
|
| 757 |
+
with rv_col3:
|
| 758 |
+
review_approval_filter = st.selectbox(
|
| 759 |
+
"Show",
|
| 760 |
+
options=["All Topics", "Approved Only", "Not Approved"],
|
| 761 |
+
index=0,
|
| 762 |
+
key="review_approval_filter",
|
| 763 |
+
)
|
| 764 |
+
|
| 765 |
+
# Apply filters
|
| 766 |
+
filtered_review = st.session_state.review_df[
|
| 767 |
+
st.session_state.review_df["source"].isin(review_source_filter)
|
| 768 |
+
]
|
| 769 |
+
if review_search:
|
| 770 |
+
search_mask = (
|
| 771 |
+
filtered_review["keywords"].str.contains(review_search, case=False, na=False) |
|
| 772 |
+
filtered_review["label"].str.contains(review_search, case=False, na=False)
|
| 773 |
+
)
|
| 774 |
+
filtered_review = filtered_review[search_mask]
|
| 775 |
+
if review_approval_filter == "Approved Only":
|
| 776 |
+
filtered_review = filtered_review[filtered_review["approve"] == True]
|
| 777 |
+
elif review_approval_filter == "Not Approved":
|
| 778 |
+
filtered_review = filtered_review[filtered_review["approve"] == False]
|
| 779 |
+
|
| 780 |
+
# Editable data editor
|
| 781 |
+
edited_df = st.data_editor(
|
| 782 |
+
filtered_review,
|
| 783 |
+
use_container_width=True,
|
| 784 |
+
height=500,
|
| 785 |
+
num_rows="fixed",
|
| 786 |
+
key="review_editor",
|
| 787 |
+
column_config={
|
| 788 |
+
"topic_id": st.column_config.NumberColumn(
|
| 789 |
+
"Topic ID", width="small", disabled=True
|
| 790 |
+
),
|
| 791 |
+
"label": st.column_config.TextColumn(
|
| 792 |
+
"Label", width="medium",
|
| 793 |
+
),
|
| 794 |
+
"keywords": st.column_config.TextColumn(
|
| 795 |
+
"Keywords", width="large", disabled=True,
|
| 796 |
+
),
|
| 797 |
+
"source": st.column_config.TextColumn(
|
| 798 |
+
"Source", width="small", disabled=True,
|
| 799 |
+
),
|
| 800 |
+
"approve": st.column_config.CheckboxColumn(
|
| 801 |
+
"✅ Approve", width="small", default=False,
|
| 802 |
+
),
|
| 803 |
+
"rename_to": st.column_config.TextColumn(
|
| 804 |
+
"Rename To", width="medium",
|
| 805 |
+
),
|
| 806 |
+
"reasoning": st.column_config.TextColumn(
|
| 807 |
+
"Reasoning / Notes", width="large",
|
| 808 |
+
),
|
| 809 |
+
},
|
| 810 |
+
column_order=["topic_id", "label", "keywords", "approve", "rename_to", "reasoning", "source"],
|
| 811 |
+
)
|
| 812 |
+
|
| 813 |
+
# Update session state with edits
|
| 814 |
+
if edited_df is not None:
|
| 815 |
+
# Merge edits back into the full review dataframe
|
| 816 |
+
for idx, erow in edited_df.iterrows():
|
| 817 |
+
mask = st.session_state.review_df.index == idx
|
| 818 |
+
if mask.any():
|
| 819 |
+
for col in ["label", "approve", "rename_to", "reasoning"]:
|
| 820 |
+
if col in erow:
|
| 821 |
+
st.session_state.review_df.loc[mask, col] = erow[col]
|
| 822 |
+
|
| 823 |
+
# Save button
|
| 824 |
+
sv_col1, sv_col2, sv_col3 = st.columns([1, 1, 2])
|
| 825 |
+
with sv_col1:
|
| 826 |
+
if st.button("💾 Save Review Table", use_container_width=True, type="primary"):
|
| 827 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 828 |
+
st.session_state.review_df.to_csv(review_csv_path, index=False)
|
| 829 |
+
st.markdown(
|
| 830 |
+
'<div class="save-confirm">✅ Review table saved to outputs/review_table.csv</div>',
|
| 831 |
+
unsafe_allow_html=True,
|
| 832 |
+
)
|
| 833 |
+
with sv_col2:
|
| 834 |
+
approved_count = int(st.session_state.review_df["approve"].sum()) if "approve" in st.session_state.review_df.columns else 0
|
| 835 |
+
total_count = len(st.session_state.review_df)
|
| 836 |
+
st.markdown(f"""
|
| 837 |
+
<div class="stat-card" style="padding: 0.75rem 1rem;">
|
| 838 |
+
<div class="stat-value" style="font-size: 1.4rem;">{approved_count}/{total_count}</div>
|
| 839 |
+
<div class="stat-label">Topics Approved</div>
|
| 840 |
+
</div>
|
| 841 |
+
""", unsafe_allow_html=True)
|
| 842 |
+
|
| 843 |
+
# ==================================================================
|
| 844 |
+
# Tab 7: Charts
|
| 845 |
+
# ==================================================================
|
| 846 |
+
with tab_charts:
|
| 847 |
+
st.markdown('<div class="section-header">📈 Topic Visualizations</div>', unsafe_allow_html=True)
|
| 848 |
+
|
| 849 |
+
# -----------------------------------------------------------
|
| 850 |
+
# Chart 1: Topic Frequency by Source
|
| 851 |
+
# -----------------------------------------------------------
|
| 852 |
+
st.markdown("#### 📊 Topic Frequency by Source")
|
| 853 |
+
st.caption("Number of topics discovered from each source (Titles vs Abstracts).")
|
| 854 |
+
|
| 855 |
+
source_counts = result.combined_topics["source"].value_counts().reset_index()
|
| 856 |
+
source_counts.columns = ["Source", "Count"]
|
| 857 |
+
|
| 858 |
+
fig1 = px.bar(
|
| 859 |
+
source_counts,
|
| 860 |
+
x="Source",
|
| 861 |
+
y="Count",
|
| 862 |
+
color="Source",
|
| 863 |
+
color_discrete_sequence=["#667eea", "#764ba2"],
|
| 864 |
+
text="Count",
|
| 865 |
+
)
|
| 866 |
+
fig1.update_traces(
|
| 867 |
+
textposition="outside",
|
| 868 |
+
textfont=dict(color="#e0e0e0", size=14, family="Inter"),
|
| 869 |
+
marker=dict(
|
| 870 |
+
line=dict(width=0),
|
| 871 |
+
),
|
| 872 |
+
)
|
| 873 |
+
fig1.update_layout(
|
| 874 |
+
**PLOTLY_LAYOUT,
|
| 875 |
+
title="Topic Count by Source",
|
| 876 |
+
xaxis_title="Source",
|
| 877 |
+
yaxis_title="Number of Topics",
|
| 878 |
+
showlegend=False,
|
| 879 |
+
height=420,
|
| 880 |
+
)
|
| 881 |
+
st.plotly_chart(fig1, use_container_width=True)
|
| 882 |
+
|
| 883 |
+
st.markdown("---")
|
| 884 |
+
|
| 885 |
+
# -----------------------------------------------------------
|
| 886 |
+
# Chart 2: Top Keywords Across All Topics
|
| 887 |
+
# -----------------------------------------------------------
|
| 888 |
+
st.markdown("#### 🔤 Top Keywords Across All Topics")
|
| 889 |
+
st.caption("Most frequently occurring keywords across all discovered topics.")
|
| 890 |
+
|
| 891 |
+
# Extract all keywords, count frequencies
|
| 892 |
+
all_keywords = []
|
| 893 |
+
for kw_str in result.combined_topics["keywords"].dropna():
|
| 894 |
+
for kw in kw_str.split(","):
|
| 895 |
+
kw_clean = kw.strip().lower()
|
| 896 |
+
if kw_clean and len(kw_clean) > 2:
|
| 897 |
+
all_keywords.append(kw_clean)
|
| 898 |
+
|
| 899 |
+
kw_counts = pd.Series(all_keywords).value_counts().head(25).reset_index()
|
| 900 |
+
kw_counts.columns = ["Keyword", "Frequency"]
|
| 901 |
+
|
| 902 |
+
fig2 = px.bar(
|
| 903 |
+
kw_counts,
|
| 904 |
+
x="Frequency",
|
| 905 |
+
y="Keyword",
|
| 906 |
+
orientation="h",
|
| 907 |
+
color="Frequency",
|
| 908 |
+
color_continuous_scale=["#302b63", "#667eea", "#f093fb", "#f5576c"],
|
| 909 |
+
)
|
| 910 |
+
fig2.update_traces(
|
| 911 |
+
marker=dict(line=dict(width=0)),
|
| 912 |
+
)
|
| 913 |
+
fig2.update_layout(
|
| 914 |
+
**PLOTLY_LAYOUT,
|
| 915 |
+
title="Top 25 Keywords by Frequency",
|
| 916 |
+
xaxis_title="Frequency (across all topics)",
|
| 917 |
+
yaxis_title="",
|
| 918 |
+
height=700,
|
| 919 |
+
coloraxis_colorbar=dict(
|
| 920 |
+
title="Freq",
|
| 921 |
+
tickfont=dict(color="#c4b5fd"),
|
| 922 |
+
title_font=dict(color="#e0e0e0"),
|
| 923 |
+
),
|
| 924 |
+
)
|
| 925 |
+
# Override yaxis separately to avoid duplicate keyword with PLOTLY_LAYOUT
|
| 926 |
+
fig2.update_layout(
|
| 927 |
+
yaxis=dict(
|
| 928 |
+
autorange="reversed",
|
| 929 |
+
gridcolor="rgba(102, 126, 234, 0.1)",
|
| 930 |
+
tickfont=dict(color="#c4b5fd", size=12),
|
| 931 |
+
),
|
| 932 |
+
)
|
| 933 |
+
st.plotly_chart(fig2, use_container_width=True)
|
| 934 |
+
|
| 935 |
+
st.markdown("---")
|
| 936 |
+
|
| 937 |
+
# -----------------------------------------------------------
|
| 938 |
+
# Chart 3: Taxonomy Distribution (Mapped vs Novel)
|
| 939 |
+
# -----------------------------------------------------------
|
| 940 |
+
st.markdown("#### 🧬 Taxonomy Classification Distribution")
|
| 941 |
+
st.caption("How topics are classified against the known research taxonomy.")
|
| 942 |
+
|
| 943 |
+
tax_meta = result.taxonomy_map.get("metadata", {})
|
| 944 |
+
tax_data = pd.DataFrame({
|
| 945 |
+
"Classification": ["MAPPED", "NOVEL"],
|
| 946 |
+
"Count": [tax_meta.get("mapped_count", 0), tax_meta.get("novel_count", 0)],
|
| 947 |
+
})
|
| 948 |
+
|
| 949 |
+
chart3_col1, chart3_col2 = st.columns(2)
|
| 950 |
+
|
| 951 |
+
with chart3_col1:
|
| 952 |
+
fig3a = px.pie(
|
| 953 |
+
tax_data,
|
| 954 |
+
values="Count",
|
| 955 |
+
names="Classification",
|
| 956 |
+
color="Classification",
|
| 957 |
+
color_discrete_map={
|
| 958 |
+
"MAPPED": "#38ef7d",
|
| 959 |
+
"NOVEL": "#f7b733",
|
| 960 |
+
},
|
| 961 |
+
hole=0.55,
|
| 962 |
+
)
|
| 963 |
+
fig3a.update_traces(
|
| 964 |
+
textfont=dict(color="#ffffff", size=14),
|
| 965 |
+
textinfo="percent+label",
|
| 966 |
+
marker=dict(line=dict(color="#1a1a2e", width=3)),
|
| 967 |
+
)
|
| 968 |
+
fig3a.update_layout(
|
| 969 |
+
paper_bgcolor="rgba(26, 26, 46, 0.95)",
|
| 970 |
+
plot_bgcolor="rgba(22, 33, 62, 0.95)",
|
| 971 |
+
font=dict(family="Inter, sans-serif", size=13, color="#e0e0e0"),
|
| 972 |
+
title=dict(text="Mapped vs Novel", font=dict(size=16, color="#ffffff")),
|
| 973 |
+
legend=dict(font=dict(color="#e0e0e0")),
|
| 974 |
+
height=380,
|
| 975 |
+
margin=dict(l=20, r=20, t=50, b=20),
|
| 976 |
+
)
|
| 977 |
+
st.plotly_chart(fig3a, use_container_width=True)
|
| 978 |
+
|
| 979 |
+
with chart3_col2:
|
| 980 |
+
fig3b = px.bar(
|
| 981 |
+
tax_data,
|
| 982 |
+
x="Classification",
|
| 983 |
+
y="Count",
|
| 984 |
+
color="Classification",
|
| 985 |
+
color_discrete_map={
|
| 986 |
+
"MAPPED": "#38ef7d",
|
| 987 |
+
"NOVEL": "#f7b733",
|
| 988 |
+
},
|
| 989 |
+
text="Count",
|
| 990 |
+
)
|
| 991 |
+
fig3b.update_traces(
|
| 992 |
+
textposition="outside",
|
| 993 |
+
textfont=dict(color="#e0e0e0", size=16, family="Inter"),
|
| 994 |
+
marker=dict(line=dict(width=0)),
|
| 995 |
+
)
|
| 996 |
+
fig3b.update_layout(
|
| 997 |
+
**PLOTLY_LAYOUT,
|
| 998 |
+
title="Classification Count",
|
| 999 |
+
xaxis_title="",
|
| 1000 |
+
yaxis_title="Number of Topics",
|
| 1001 |
+
showlegend=False,
|
| 1002 |
+
height=380,
|
| 1003 |
+
)
|
| 1004 |
+
st.plotly_chart(fig3b, use_container_width=True)
|
| 1005 |
+
|
| 1006 |
+
st.markdown("---")
|
| 1007 |
+
|
| 1008 |
+
# -----------------------------------------------------------
|
| 1009 |
+
# Chart 4: Alignment Distribution (from comparisons)
|
| 1010 |
+
# -----------------------------------------------------------
|
| 1011 |
+
if not result.comparison.empty:
|
| 1012 |
+
st.markdown("#### ⚖️ Theme Alignment Distribution")
|
| 1013 |
+
st.caption("Distribution of alignment strength between Title and Abstract topics.")
|
| 1014 |
+
|
| 1015 |
+
alignment_data = result.comparison["alignment"].value_counts().reset_index()
|
| 1016 |
+
alignment_data.columns = ["Alignment", "Count"]
|
| 1017 |
+
|
| 1018 |
+
# Define order and colors
|
| 1019 |
+
align_order = ["Strong", "Moderate", "Weak", "No Match"]
|
| 1020 |
+
align_colors = {
|
| 1021 |
+
"Strong": "#38ef7d",
|
| 1022 |
+
"Moderate": "#4facfe",
|
| 1023 |
+
"Weak": "#f7971e",
|
| 1024 |
+
"No Match": "#f5576c",
|
| 1025 |
+
}
|
| 1026 |
+
|
| 1027 |
+
fig4 = px.bar(
|
| 1028 |
+
alignment_data,
|
| 1029 |
+
x="Alignment",
|
| 1030 |
+
y="Count",
|
| 1031 |
+
color="Alignment",
|
| 1032 |
+
color_discrete_map=align_colors,
|
| 1033 |
+
text="Count",
|
| 1034 |
+
category_orders={"Alignment": align_order},
|
| 1035 |
+
)
|
| 1036 |
+
fig4.update_traces(
|
| 1037 |
+
textposition="outside",
|
| 1038 |
+
textfont=dict(color="#e0e0e0", size=14, family="Inter"),
|
| 1039 |
+
marker=dict(line=dict(width=0)),
|
| 1040 |
+
)
|
| 1041 |
+
fig4.update_layout(
|
| 1042 |
+
**PLOTLY_LAYOUT,
|
| 1043 |
+
title="Title ↔ Abstract Alignment Distribution",
|
| 1044 |
+
xaxis_title="Alignment Level",
|
| 1045 |
+
yaxis_title="Number of Topic Pairs",
|
| 1046 |
+
showlegend=False,
|
| 1047 |
+
height=420,
|
| 1048 |
+
)
|
| 1049 |
+
st.plotly_chart(fig4, use_container_width=True)
|
| 1050 |
+
|
| 1051 |
+
st.markdown("---")
|
| 1052 |
+
|
| 1053 |
+
# -----------------------------------------------------------
|
| 1054 |
+
# Chart 5: Similarity Score Histogram
|
| 1055 |
+
# -----------------------------------------------------------
|
| 1056 |
+
st.markdown("#### 📐 Similarity Score Distribution")
|
| 1057 |
+
st.caption("Distribution of Jaccard similarity scores between matched Title and Abstract topics.")
|
| 1058 |
+
|
| 1059 |
+
fig5 = px.histogram(
|
| 1060 |
+
result.comparison,
|
| 1061 |
+
x="similarity",
|
| 1062 |
+
nbins=30,
|
| 1063 |
+
color_discrete_sequence=["#667eea"],
|
| 1064 |
+
marginal="box",
|
| 1065 |
+
)
|
| 1066 |
+
fig5.update_traces(
|
| 1067 |
+
marker=dict(
|
| 1068 |
+
line=dict(width=1, color="#b8b5ff"),
|
| 1069 |
+
),
|
| 1070 |
+
selector=dict(type="histogram"),
|
| 1071 |
+
)
|
| 1072 |
+
fig5.update_layout(
|
| 1073 |
+
**PLOTLY_LAYOUT,
|
| 1074 |
+
title="Similarity Score Histogram",
|
| 1075 |
+
xaxis_title="Jaccard Similarity Score",
|
| 1076 |
+
yaxis_title="Count",
|
| 1077 |
+
height=420,
|
| 1078 |
+
bargap=0.05,
|
| 1079 |
+
)
|
| 1080 |
+
st.plotly_chart(fig5, use_container_width=True)
|
| 1081 |
+
|
| 1082 |
+
# ---- Downloads Section ----
|
| 1083 |
+
st.markdown('<div class="section-header">📥 Download Outputs</div>', unsafe_allow_html=True)
|
| 1084 |
+
|
| 1085 |
+
dcol1, dcol2, dcol3, dcol4 = st.columns(4)
|
| 1086 |
+
|
| 1087 |
+
with dcol1:
|
| 1088 |
+
csv_data = result.combined_topics.to_csv(index=False)
|
| 1089 |
+
st.download_button(
|
| 1090 |
+
"⬇️ Topics Table (CSV)",
|
| 1091 |
+
data=csv_data,
|
| 1092 |
+
file_name="topics_table.csv",
|
| 1093 |
+
mime="text/csv",
|
| 1094 |
+
use_container_width=True,
|
| 1095 |
+
)
|
| 1096 |
+
|
| 1097 |
+
with dcol2:
|
| 1098 |
+
comp_data = result.comparison.to_csv(index=False)
|
| 1099 |
+
st.download_button(
|
| 1100 |
+
"⬇️ Comparison (CSV)",
|
| 1101 |
+
data=comp_data,
|
| 1102 |
+
file_name="comparison.csv",
|
| 1103 |
+
mime="text/csv",
|
| 1104 |
+
use_container_width=True,
|
| 1105 |
+
)
|
| 1106 |
+
|
| 1107 |
+
with dcol3:
|
| 1108 |
+
json_data = json.dumps(result.taxonomy_map, indent=2, ensure_ascii=False)
|
| 1109 |
+
st.download_button(
|
| 1110 |
+
"⬇️ Taxonomy Map (JSON)",
|
| 1111 |
+
data=json_data,
|
| 1112 |
+
file_name="taxonomy_map.json",
|
| 1113 |
+
mime="application/json",
|
| 1114 |
+
use_container_width=True,
|
| 1115 |
+
)
|
| 1116 |
+
|
| 1117 |
+
with dcol4:
|
| 1118 |
+
# Download review table if it exists
|
| 1119 |
+
review_path = os.path.join(OUTPUT_DIR, "review_table.csv")
|
| 1120 |
+
if os.path.exists(review_path):
|
| 1121 |
+
with open(review_path, "r") as f:
|
| 1122 |
+
review_data = f.read()
|
| 1123 |
+
st.download_button(
|
| 1124 |
+
"⬇️ Review Table (CSV)",
|
| 1125 |
+
data=review_data,
|
| 1126 |
+
file_name="review_table.csv",
|
| 1127 |
+
mime="text/csv",
|
| 1128 |
+
use_container_width=True,
|
| 1129 |
+
)
|
| 1130 |
+
else:
|
| 1131 |
+
st.download_button(
|
| 1132 |
+
"⬇️ Review Table (CSV)",
|
| 1133 |
+
data="Not saved yet. Go to Review Table tab and click Save.",
|
| 1134 |
+
file_name="review_table.csv",
|
| 1135 |
+
mime="text/csv",
|
| 1136 |
+
use_container_width=True,
|
| 1137 |
+
disabled=True,
|
| 1138 |
+
)
|
| 1139 |
+
|
| 1140 |
+
# ---- Auto-save comparison.csv and taxonomy_map.json to outputs ----
|
| 1141 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 1142 |
+
result.comparison.to_csv(os.path.join(OUTPUT_DIR, "comparison.csv"), index=False)
|
| 1143 |
+
with open(os.path.join(OUTPUT_DIR, "taxonomy_map.json"), "w", encoding="utf-8") as f:
|
| 1144 |
+
json.dump(result.taxonomy_map, f, indent=2, ensure_ascii=False)
|
| 1145 |
+
|
| 1146 |
+
# ---- Pipeline Log ----
|
| 1147 |
+
with st.expander("📜 Pipeline Execution Log"):
|
| 1148 |
+
st.markdown(f"**Status:** `{result.status}`")
|
| 1149 |
+
st.markdown(f"**Steps Completed:** {len(result.steps_completed)}/9")
|
| 1150 |
+
for i, step in enumerate(result.steps_completed, 1):
|
| 1151 |
+
st.markdown(f" ✅ Step {i}: `{step}`")
|
| 1152 |
+
if result.errors:
|
| 1153 |
+
st.markdown("**Errors:**")
|
| 1154 |
+
for err in result.errors:
|
| 1155 |
+
st.error(err)
|
| 1156 |
+
st.markdown("**Exported Files:**")
|
| 1157 |
+
for name, path in result.exported_files.items():
|
| 1158 |
+
st.markdown(f" 📄 `{name}` → `{path}`")
|
| 1159 |
+
|
| 1160 |
+
else:
|
| 1161 |
+
# ---- Welcome / instructions when no results ----
|
| 1162 |
+
st.markdown("""
|
| 1163 |
+
<div class="info-box">
|
| 1164 |
+
<strong>👋 Welcome!</strong><br><br>
|
| 1165 |
+
This application uses an AI agent to perform comprehensive topic modeling on research papers.
|
| 1166 |
+
<br><br>
|
| 1167 |
+
<strong>How to use:</strong><br>
|
| 1168 |
+
1️⃣ Upload a CSV file with <code>Title</code> and <code>Abstract</code> columns (or use the default dataset).<br>
|
| 1169 |
+
2️⃣ Configure the minimum number of topics and label generation method in the sidebar.<br>
|
| 1170 |
+
3️⃣ Click <strong>"🚀 Run Topic Modeling Agent"</strong> to start the analysis.<br>
|
| 1171 |
+
4️⃣ Explore topics, comparisons, and taxonomy classification in the results tabs.<br>
|
| 1172 |
+
5️⃣ Review and annotate topics in the <strong>✏️ Review Table</strong> tab.<br>
|
| 1173 |
+
6️⃣ View interactive charts in the <strong>📈 Charts</strong> tab.<br>
|
| 1174 |
+
7️⃣ Download all outputs as CSV and JSON files.
|
| 1175 |
+
</div>
|
| 1176 |
+
""", unsafe_allow_html=True)
|
| 1177 |
+
|
| 1178 |
+
st.markdown("<br>", unsafe_allow_html=True)
|
| 1179 |
+
|
| 1180 |
+
# Show a preview if default dataset exists
|
| 1181 |
+
if os.path.exists("dataset.csv"):
|
| 1182 |
+
with st.expander("👀 Preview Default Dataset", expanded=False):
|
| 1183 |
+
try:
|
| 1184 |
+
preview_df = pd.read_csv("dataset.csv", nrows=10)
|
| 1185 |
+
st.markdown(f"**Columns:** {', '.join(preview_df.columns.tolist())}")
|
| 1186 |
+
if "Title" in preview_df.columns:
|
| 1187 |
+
st.dataframe(preview_df[["Title", "Abstract"]].head(10) if "Abstract" in preview_df.columns else preview_df[["Title"]].head(10), use_container_width=True)
|
| 1188 |
+
else:
|
| 1189 |
+
st.dataframe(preview_df.head(10), use_container_width=True)
|
| 1190 |
+
except Exception as e:
|
| 1191 |
+
st.warning(f"Could not preview dataset: {e}")
|
dataset.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a602dfcb3982c58156c67f4fb2565cc8ec9b4b2368a1b6ad4be3c621c1232218
|
| 3 |
+
size 28342399
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Requirements for the AI Topic Modeling Agent
|
| 2 |
+
# 13 packages as specified
|
| 3 |
+
|
| 4 |
+
streamlit>=1.30.0
|
| 5 |
+
pandas>=2.0.0
|
| 6 |
+
numpy>=1.24.0
|
| 7 |
+
scikit-learn>=1.3.0
|
| 8 |
+
nltk>=3.8.0
|
| 9 |
+
bertopic>=0.16.0
|
| 10 |
+
umap-learn>=0.5.4
|
| 11 |
+
hdbscan>=0.8.33
|
| 12 |
+
sentence-transformers>=2.2.0
|
| 13 |
+
groq>=0.4.0
|
| 14 |
+
plotly>=5.18.0
|
| 15 |
+
scipy>=1.11.0
|
| 16 |
+
joblib>=1.3.0
|
tools.py
ADDED
|
@@ -0,0 +1,626 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tools.py — Core functions for the AI-driven topic modeling pipeline.
|
| 3 |
+
|
| 4 |
+
This module provides all analytical functions used by the TopicAgent:
|
| 5 |
+
- CSV ingestion and validation
|
| 6 |
+
- Text preprocessing (lowercasing, stopword removal, cleaning)
|
| 7 |
+
- Topic modeling via BERTopic (with fallback to sklearn LDA)
|
| 8 |
+
- Automatic human-readable label generation
|
| 9 |
+
- Cross-source theme comparison (Title vs Abstract)
|
| 10 |
+
- Taxonomy mapping (MAPPED / NOVEL classification)
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
import json
|
| 15 |
+
import logging
|
| 16 |
+
from typing import Dict, List, Tuple, Optional, Any
|
| 17 |
+
|
| 18 |
+
import numpy as np
|
| 19 |
+
import pandas as pd
|
| 20 |
+
import nltk
|
| 21 |
+
from nltk.corpus import stopwords
|
| 22 |
+
from nltk.tokenize import word_tokenize
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Logging
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 29 |
+
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
# NLTK data download (idempotent)
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
for _resource in ("punkt", "punkt_tab", "stopwords"):
|
| 34 |
+
try:
|
| 35 |
+
nltk.data.find(f"tokenizers/{_resource}" if "punkt" in _resource else f"corpora/{_resource}")
|
| 36 |
+
except LookupError:
|
| 37 |
+
nltk.download(_resource, quiet=True)
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Reference taxonomy of known AI / business / research themes
|
| 41 |
+
# Used by create_taxonomy_map() for MAPPED vs NOVEL classification
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
KNOWN_THEMES: List[str] = [
|
| 44 |
+
# AI / ML
|
| 45 |
+
"artificial intelligence", "machine learning", "deep learning", "neural network",
|
| 46 |
+
"natural language processing", "computer vision", "reinforcement learning",
|
| 47 |
+
"generative ai", "large language model", "transformer", "chatbot",
|
| 48 |
+
"recommendation system", "knowledge graph", "robotics", "autonomous",
|
| 49 |
+
"explainable ai", "federated learning", "transfer learning", "ai ethics",
|
| 50 |
+
"adversarial", "gan", "diffusion model", "prompt engineering",
|
| 51 |
+
# Data science
|
| 52 |
+
"data mining", "big data", "analytics", "data science", "data quality",
|
| 53 |
+
"feature engineering", "dimensionality reduction", "clustering", "classification",
|
| 54 |
+
"regression", "time series", "anomaly detection", "sentiment analysis",
|
| 55 |
+
# Business / Management
|
| 56 |
+
"digital transformation", "innovation", "strategy", "supply chain",
|
| 57 |
+
"customer experience", "marketing", "e-commerce", "fintech", "blockchain",
|
| 58 |
+
"sustainability", "corporate social responsibility", "knowledge management",
|
| 59 |
+
"decision support", "business intelligence", "enterprise", "organizational",
|
| 60 |
+
"human resource", "leadership", "entrepreneurship", "business model",
|
| 61 |
+
# Information systems
|
| 62 |
+
"information systems", "technology adoption", "user acceptance", "privacy",
|
| 63 |
+
"security", "trust", "social media", "online community", "platform",
|
| 64 |
+
"crowdsourcing", "cloud computing", "iot", "internet of things",
|
| 65 |
+
"software engineering", "agile", "devops", "digital platform",
|
| 66 |
+
# Healthcare / Society
|
| 67 |
+
"healthcare", "telemedicine", "electronic health", "public health",
|
| 68 |
+
"education", "e-learning", "smart city", "government", "policy",
|
| 69 |
+
"ethics", "fairness", "bias", "misinformation", "content moderation",
|
| 70 |
+
# Research methods
|
| 71 |
+
"survey", "experiment", "case study", "meta-analysis", "bibliometric",
|
| 72 |
+
"systematic review", "structural equation", "grounded theory",
|
| 73 |
+
]
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ===================================================================
|
| 77 |
+
# 1. load_csv — Ingest and validate the CSV dataset
|
| 78 |
+
# ===================================================================
|
| 79 |
+
def load_csv(filepath: str) -> pd.DataFrame:
    """
    Load a CSV file and ensure the required columns (Title, Abstract) exist.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.

    Returns
    -------
    pd.DataFrame
        DataFrame with at least 'Title' and 'Abstract' columns.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist (propagated from pandas).
    ValueError
        If required columns are missing.
    """
    logger.info("Loading CSV from %s", filepath)
    df = pd.read_csv(filepath, encoding="utf-8", on_bad_lines="skip")
    logger.info("Loaded %d rows × %d columns", len(df), len(df.columns))

    # Case-insensitive lookup: normalised column name -> actual column name.
    lower_to_actual = {column.strip().lower(): column for column in df.columns}
    missing = {"title", "abstract"} - set(lower_to_actual)
    if missing:
        raise ValueError(f"CSV is missing required columns: {missing}. Found: {list(df.columns)}")

    # Canonicalise the two required columns to 'Title' / 'Abstract'.
    df = df.rename(columns={
        lower_to_actual["title"]: "Title",
        lower_to_actual["abstract"]: "Abstract",
    })

    # Keep any row that has at least one of Title / Abstract populated,
    # then blank out remaining NaNs so downstream string ops are safe.
    df = df.dropna(subset=["Title", "Abstract"], how="all").reset_index(drop=True)
    for column in ("Title", "Abstract"):
        df[column] = df[column].fillna("")

    logger.info("After cleaning: %d usable rows", len(df))
    return df
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# ===================================================================
|
| 124 |
+
# 2. preprocess_text — Clean and normalise a list of text documents
|
| 125 |
+
# ===================================================================
|
| 126 |
+
def preprocess_text(documents: List[str]) -> List[str]:
    """
    Apply professional-grade text preprocessing:
      1. Lowercase
      2. Remove URLs, emails, special characters, digits
      3. Tokenize
      4. Remove stopwords (NLTK English)
      5. Remove very short tokens (length ≤ 2)
      6. Rejoin into cleaned strings

    Parameters
    ----------
    documents : list of str
        Raw text documents.

    Returns
    -------
    list of str
        Cleaned text documents (empty string for blank / non-string inputs).
    """
    stop_words = set(stopwords.words("english"))
    # Extended stopwords common in academic abstracts.
    stop_words.update([
        "©", "elsevier", "rights", "reserved", "doi", "http", "https",
        "vol", "pp", "fig", "table", "journal", "author", "authors",
        "study", "paper", "research", "results", "findings", "however",
        "propose", "proposed", "approach", "using", "based", "also",
        "show", "shows", "shown", "may", "used", "use", "one", "two",
        "three", "new", "well", "within", "among", "across", "toward",
        "towards", "et", "al", "ie", "eg", "cf", "thus", "therefore",
        "moreover", "furthermore", "addition", "conclusion", "conclusions",
    ])

    def _clean_one(raw: str) -> str:
        # Normalise case, strip URLs / emails / non-letters, collapse whitespace.
        lowered = raw.lower()
        lowered = re.sub(r"https?://\S+|www\.\S+", " ", lowered)
        lowered = re.sub(r"\S+@\S+", " ", lowered)
        lowered = re.sub(r"[^a-z\s]", " ", lowered)
        lowered = re.sub(r"\s+", " ", lowered).strip()
        # Tokenise, then drop stopwords and tokens of length <= 2.
        kept = [tok for tok in word_tokenize(lowered)
                if tok not in stop_words and len(tok) > 2]
        return " ".join(kept)

    cleaned: List[str] = [
        _clean_one(doc) if isinstance(doc, str) and doc.strip() else ""
        for doc in documents
    ]

    logger.info("Preprocessed %d documents", len(cleaned))
    return cleaned
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# ===================================================================
|
| 186 |
+
# 3. run_topic_modeling — Discover topics via BERTopic (or LDA fallback)
|
| 187 |
+
# ===================================================================
|
| 188 |
+
def run_topic_modeling(
    documents: List[str],
    source_label: str = "documents",
    min_topics: int = 100,
    use_bertopic: bool = True,
) -> Tuple[pd.DataFrame, Any]:
    """
    Perform topic modeling on a corpus of preprocessed documents.

    Strategy:
      1. Try BERTopic (UMAP + HDBSCAN). If it fails or yields fewer than
         ``min_topics`` topics, fall back to sklearn LDA.
      2. LDA uses ``n_components = min_topics`` so the requested topic
         count is guaranteed.

    Parameters
    ----------
    documents : list of str
        Preprocessed text documents.
    source_label : str
        Label for logging (e.g. "Titles" or "Abstracts").
    min_topics : int
        Minimum number of topics required (default 100).
    use_bertopic : bool
        Whether to attempt BERTopic first.

    Returns
    -------
    topics_df : pd.DataFrame
        Columns: topic_id, keywords (comma-separated), source
    model : object
        The fitted topic model for downstream inspection.

    Raises
    ------
    ValueError
        If fewer than 20 non-empty documents are supplied.
    """
    # Drop blank documents before modeling.
    usable = [doc for doc in documents if doc.strip()]
    if len(usable) < 20:
        raise ValueError(f"Not enough valid documents ({len(usable)}) for topic modeling.")

    logger.info("Running topic modeling on %d %s (target ≥ %d topics)", len(usable), source_label, min_topics)

    topics_df: Optional[pd.DataFrame] = None
    model: Any = None

    # ------ Attempt BERTopic first ------
    if use_bertopic:
        try:
            topics_df, model = _run_bertopic(usable, source_label, min_topics)
        except Exception as exc:
            logger.warning("BERTopic failed (%s). Falling back to LDA.", exc)
            topics_df = None

    # ------ LDA fallback when BERTopic failed or under-delivered ------
    if topics_df is None or len(topics_df) < min_topics:
        logger.info("Using LDA to guarantee ≥ %d topics for %s", min_topics, source_label)
        topics_df, model = _run_lda(usable, source_label, min_topics)

    logger.info("Topic modeling complete for %s: %d topics discovered", source_label, len(topics_df))
    return topics_df, model
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _run_bertopic(docs: List[str], source_label: str, min_topics: int):
    """Run BERTopic with tuned parameters and return (topics_df, model)."""
    from bertopic import BERTopic
    from umap import UMAP
    from hdbscan import HDBSCAN
    from sklearn.feature_extraction.text import CountVectorizer

    # Build the pipeline inline; random_state fixed for reproducibility.
    topic_model = BERTopic(
        umap_model=UMAP(
            n_neighbors=10,
            n_components=5,
            min_dist=0.0,
            metric="cosine",
            random_state=42,
        ),
        hdbscan_model=HDBSCAN(
            min_cluster_size=5,
            min_samples=2,
            prediction_data=True,
        ),
        vectorizer_model=CountVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_df=0.90,
            min_df=2,
        ),
        nr_topics="auto",
        top_n_words=10,
        verbose=False,
    )

    topic_model.fit_transform(docs)

    # Drop the HDBSCAN outlier topic (-1) from the summary.
    info = topic_model.get_topic_info()
    non_outliers = info[info["Topic"] != -1].reset_index(drop=True)

    records = []
    for _, entry in non_outliers.iterrows():
        topic_id = int(entry["Topic"])
        words = topic_model.get_topic(topic_id)
        records.append({
            "topic_id": topic_id,
            "keywords": ", ".join(word for word, _score in words[:10]),
            "source": source_label,
        })

    return pd.DataFrame(records), topic_model
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def _run_lda(docs: List[str], source_label: str, n_topics: int):
    """Run sklearn LDA to guarantee the requested number of topics."""
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer

    # Document-term matrix over uni/bi-grams, capped vocabulary.
    vectorizer = CountVectorizer(
        stop_words="english",
        max_df=0.90,
        min_df=2,
        ngram_range=(1, 2),
        max_features=10000,
    )
    doc_term = vectorizer.fit_transform(docs)
    vocab = vectorizer.get_feature_names_out()

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=25,
        learning_method="online",
        random_state=42,
        n_jobs=-1,
    )
    lda.fit(doc_term)

    # One row per topic: top-10 terms by component weight, highest first.
    records = [
        {
            "topic_id": topic_idx,
            "keywords": ", ".join(vocab[i] for i in weights.argsort()[-10:][::-1]),
            "source": source_label,
        }
        for topic_idx, weights in enumerate(lda.components_)
    ]

    return pd.DataFrame(records), lda
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# ===================================================================
# 4. generate_labels — Create human-readable labels for each topic
# ===================================================================
def generate_labels(
    topics_df: pd.DataFrame,
    use_llm: bool = False,
    groq_api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Attach a short human-readable label to every topic row.

    Two labelling strategies are available:

    * LLM-based — when ``use_llm`` is True and a Groq API key is
      supplied, contextual labels come from the Groq LLM
      (llama-3.3-70b-versatile, free tier).
    * Heuristic — otherwise the first 3–4 keywords are title-cased and
      joined into a label.

    Parameters
    ----------
    topics_df : pd.DataFrame
        Must contain columns 'topic_id' and 'keywords'.
    use_llm : bool
        Whether to use the Groq LLM for label generation.
    groq_api_key : str, optional
        Groq API key, required if use_llm is True.

    Returns
    -------
    pd.DataFrame
        Same DataFrame with an additional 'label' column.
    """
    if use_llm and groq_api_key:
        logger.info("Generating labels using Groq LLM …")
        return _generate_labels_llm(topics_df, groq_api_key)

    logger.info("Generating labels using keyword heuristic …")
    return _generate_labels_heuristic(topics_df)


def _generate_labels_heuristic(df: pd.DataFrame) -> pd.DataFrame:
|
| 374 |
+
"""Create labels from the top keywords of each topic."""
|
| 375 |
+
labels = []
|
| 376 |
+
for _, row in df.iterrows():
|
| 377 |
+
kws = [kw.strip() for kw in row["keywords"].split(",")]
|
| 378 |
+
# Take the first 3-4 non-trivial keywords and title-case them
|
| 379 |
+
candidates = [kw.title() for kw in kws if len(kw) > 2][:4]
|
| 380 |
+
label = " / ".join(candidates) if candidates else f"Topic {row['topic_id']}"
|
| 381 |
+
labels.append(label)
|
| 382 |
+
df = df.copy()
|
| 383 |
+
df["label"] = labels
|
| 384 |
+
return df
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def _generate_labels_llm(df: pd.DataFrame, api_key: str) -> pd.DataFrame:
    """Label topics contextually via the Groq API, ten topics per call.

    Falls back to keyword-derived labels when the ``groq`` package is
    missing, when a batch request fails, or when the model's reply
    contains no parseable JSON array. Returns a copy of *df* with a new
    'label' column.
    """
    import time

    try:
        from groq import Groq
    except ImportError:
        logger.warning("groq package not installed. Falling back to heuristic labels.")
        return _generate_labels_heuristic(df)

    def _keyword_fallback(topic_row) -> str:
        # First four title-cased keywords, joined — used whenever the
        # LLM response is missing or unusable.
        parts = [kw.strip().title() for kw in topic_row["keywords"].split(",")][:4]
        return " / ".join(parts)

    client = Groq(api_key=api_key)
    labels = []
    batch_size = 10  # keep requests small to stay under free-tier rate limits

    for start in range(0, len(df), batch_size):
        chunk = df.iloc[start:start + batch_size]
        topic_lines = [
            f"Topic {row['topic_id']}: keywords = [{row['keywords']}]"
            for _, row in chunk.iterrows()
        ]
        prompt = (
            "You are a research taxonomy expert. For each topic below, "
            "generate a concise, descriptive label (3-6 words) that captures "
            "the theme of the keywords. Return ONLY a JSON list of objects "
            'with keys "topic_id" and "label". No extra text.\n\n'
            + "\n".join(topic_lines)
        )

        try:
            completion = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=1024,
            )
            reply = completion.choices[0].message.content.strip()
            # The model is asked for bare JSON, but tolerate extra prose
            # around it by grabbing the first bracketed array.
            array_match = re.search(r"\[.*\]", reply, re.DOTALL)
            if array_match:
                parsed = json.loads(array_match.group())
                id_to_label = {item["topic_id"]: item["label"] for item in parsed}
                for _, row in chunk.iterrows():
                    labels.append(id_to_label.get(row["topic_id"], f"Topic {row['topic_id']}"))
            else:
                for _, row in chunk.iterrows():
                    labels.append(_keyword_fallback(row))
        except Exception as exc:
            logger.warning("Groq API error for batch starting at %d: %s", start, exc)
            for _, row in chunk.iterrows():
                labels.append(_keyword_fallback(row))

        # Rate-limit courtesy delay between batches.
        time.sleep(0.5)

    df = df.copy()
    df["label"] = labels
    return df


# ===================================================================
# 5. compare_themes — Cross-compare title vs abstract topics
# ===================================================================
def compare_themes(
    title_topics: pd.DataFrame,
    abstract_topics: pd.DataFrame,
) -> pd.DataFrame:
    """
    Pair every title-derived topic with its closest abstract-derived topic.

    For each title topic, the Jaccard similarity of the comma-separated
    keyword sets is computed against every abstract topic; the highest-
    scoring abstract topic is reported alongside an alignment bucket
    (Strong ≥ 0.4, Moderate ≥ 0.2, Weak > 0, otherwise "No Match").

    Parameters
    ----------
    title_topics : pd.DataFrame
        Topics extracted from titles (with 'topic_id', 'keywords', 'label').
    abstract_topics : pd.DataFrame
        Topics extracted from abstracts (with 'topic_id', 'keywords', 'label').

    Returns
    -------
    pd.DataFrame
        One row per title topic with columns:
        title_topic_id, title_label, title_keywords,
        abstract_topic_id, abstract_label, abstract_keywords,
        similarity, alignment
    """
    logger.info("Comparing themes: %d title topics × %d abstract topics",
                len(title_topics), len(abstract_topics))

    def _to_token_set(raw: str) -> set:
        return {part.strip().lower() for part in raw.split(",") if part.strip()}

    records = []
    for _, title_row in title_topics.iterrows():
        title_kws = _to_token_set(title_row["keywords"])
        top_score = 0.0
        matched_row = None

        for _, abs_row in abstract_topics.iterrows():
            abs_kws = _to_token_set(abs_row["keywords"])
            if not title_kws or not abs_kws:
                continue
            # Jaccard similarity on the keyword sets.
            shared = len(title_kws & abs_kws)
            total = len(title_kws | abs_kws)
            score = shared / total if total else 0.0
            if score > top_score:
                top_score = score
                matched_row = abs_row

        if top_score >= 0.4:
            alignment = "Strong"
        elif top_score >= 0.2:
            alignment = "Moderate"
        elif top_score > 0:
            alignment = "Weak"
        else:
            alignment = "No Match"

        records.append({
            "title_topic_id": title_row["topic_id"],
            "title_label": title_row.get("label", ""),
            "title_keywords": title_row["keywords"],
            "abstract_topic_id": matched_row["topic_id"] if matched_row is not None else None,
            "abstract_label": matched_row.get("label", "") if matched_row is not None else "",
            "abstract_keywords": matched_row["keywords"] if matched_row is not None else "",
            "similarity": round(top_score, 4),
            "alignment": alignment,
        })

    comparison_df = pd.DataFrame(records)
    logger.info("Theme comparison complete: %d rows", len(comparison_df))
    return comparison_df


# ===================================================================
# 6. create_taxonomy_map — Classify themes as MAPPED or NOVEL
# ===================================================================
def create_taxonomy_map(
    topics_df: pd.DataFrame,
    known_themes: Optional[List[str]] = None,
    threshold: float = 0.15,
) -> Dict[str, Any]:
    """
    Split topics into MAPPED (resembling a known AI / business / IS
    research theme) and NOVEL (no sufficiently similar known theme).

    Each topic's keyword set — both full keyword phrases and their
    individual words — is scored against every known theme with Jaccard
    similarity; a topic is MAPPED when its best score reaches
    ``threshold``.

    Parameters
    ----------
    topics_df : pd.DataFrame
        Must contain 'topic_id', 'keywords', and 'label' columns.
    known_themes : list of str, optional
        Reference themes (defaults to the built-in KNOWN_THEMES).
    threshold : float
        Minimum overlap ratio to classify as MAPPED.

    Returns
    -------
    dict
        JSON-serialisable taxonomy map:
        {
          "metadata": { ... },
          "mapped": [ {topic_id, label, keywords, matched_theme, score}, ... ],
          "novel":  [ {topic_id, label, keywords, score}, ... ],
        }
    """
    if known_themes is None:
        known_themes = KNOWN_THEMES

    logger.info("Building taxonomy map for %d topics (threshold=%.2f)", len(topics_df), threshold)

    mapped: List[Dict] = []
    novel: List[Dict] = []

    # Pre-tokenise the reference themes once.
    theme_token_sets = [set(theme.lower().split()) for theme in known_themes]

    for _, row in topics_df.iterrows():
        keyword_phrases = {
            kw.strip().lower()
            for kw in row["keywords"].split(",")
            if kw.strip()
        }
        # Score against both whole phrases and their constituent words so
        # multi-word keywords can still match single-word theme tokens.
        tokens = set(keyword_phrases)
        for phrase in keyword_phrases:
            tokens.update(phrase.split())

        best_score = 0.0
        best_theme = ""
        for theme_name, theme_tokens in zip(known_themes, theme_token_sets):
            if not tokens or not theme_tokens:
                continue
            overlap = len(tokens & theme_tokens)
            combined = len(tokens | theme_tokens)
            ratio = overlap / combined if combined else 0.0
            if ratio > best_score:
                best_score = ratio
                best_theme = theme_name

        entry = {
            "topic_id": int(row["topic_id"]),
            "label": row.get("label", ""),
            "keywords": row["keywords"],
            "score": round(best_score, 4),
        }

        if best_score >= threshold:
            entry["matched_theme"] = best_theme
            entry["classification"] = "MAPPED"
            mapped.append(entry)
        else:
            entry["classification"] = "NOVEL"
            novel.append(entry)

    taxonomy = {
        "metadata": {
            "total_topics": len(topics_df),
            "mapped_count": len(mapped),
            "novel_count": len(novel),
            "threshold": threshold,
        },
        "mapped": mapped,
        "novel": novel,
    }

    logger.info("Taxonomy: %d MAPPED, %d NOVEL", len(mapped), len(novel))
    return taxonomy