Enayut commited on
Commit
ed51280
·
verified ·
1 Parent(s): f7f4f4c

Upload 7 files

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +41 -0
  3. README.md +10 -0
  4. agent.py +287 -0
  5. app.py +1191 -0
  6. dataset.csv +3 -0
  7. requirements.txt +16 -0
  8. tools.py +626 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ dataset.csv filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies:
#  - build-essential: needed to compile some Python packages from source
#  - curl: required by the HEALTHCHECK below (NOT preinstalled in -slim
#    images; without it the health check always fails)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (Docker layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data at build time so it's baked into the image
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords')"

# Copy application code
COPY app.py .
COPY agent.py .
COPY tools.py .

# Copy dataset (bundled as the default dataset)
COPY dataset.csv .

# Create writable outputs directory (world-writable so a non-root runtime
# user can still write pipeline artefacts)
RUN mkdir -p /app/outputs && chmod 777 /app/outputs

# Expose Streamlit port
EXPOSE 8501

# Health check against Streamlit's built-in health endpoint
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1

# Run Streamlit
CMD ["streamlit", "run", "app.py", \
     "--server.port=8501", \
     "--server.address=0.0.0.0", \
     "--server.headless=true", \
     "--browser.gatherUsageStats=false", \
     "--server.fileWatcherType=none"]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Research Topic Modeler
3
+ emoji: 🔬
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 8501
8
+ pinned: false
9
+ license: mit
10
+ ---
agent.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ agent.py — TopicAgent orchestrates the end-to-end topic modeling workflow.
3
+
4
+ This module defines the TopicAgent class, which:
5
+ 1. Loads and validates the CSV dataset.
6
+ 2. Preprocesses text for Titles and Abstracts separately.
7
+ 3. Runs topic modeling on each corpus (≥100 topics guaranteed).
8
+ 4. Generates human-readable labels for every topic.
9
+ 5. Compares dominant themes across Title and Abstract topics.
10
+ 6. Produces a taxonomy map (MAPPED / NOVEL classification).
11
+ 7. Exports structured outputs: topics table, comparison CSV, taxonomy JSON.
12
+
13
+ Usage:
14
+ agent = TopicAgent(csv_path="dataset.csv")
15
+ results = agent.run()
16
+ """
17
+
18
+ import os
19
+ import json
20
+ import logging
21
+ from dataclasses import dataclass, field
22
+ from typing import Dict, Any, Optional
23
+
24
+ import pandas as pd
25
+
26
+ from tools import (
27
+ load_csv,
28
+ preprocess_text,
29
+ run_topic_modeling,
30
+ generate_labels,
31
+ compare_themes,
32
+ create_taxonomy_map,
33
+ )
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Logging
37
+ # ---------------------------------------------------------------------------
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Structured result container
43
+ # ---------------------------------------------------------------------------
44
@dataclass
class AgentResult:
    """Container for all outputs produced by the TopicAgent.

    Every field defaults to an empty container, so a freshly constructed
    (or partially completed / failed) result is always safe to read.
    """
    # Core dataframes:
    #   title_topics / abstract_topics — one table of discovered topics per
    #   corpus; combined_topics — both tables concatenated with an added
    #   "global_id" column; comparison — title-vs-abstract theme comparison.
    title_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    abstract_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    combined_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    comparison: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Taxonomy map (dict serialisable to JSON). Contains a "metadata" entry
    # with "mapped_count" / "novel_count" keys (read by _step_taxonomy_map's
    # logging and by the frontend).
    taxonomy_map: Dict[str, Any] = field(default_factory=dict)

    # Execution metadata:
    #   status          — "pending" until run() finishes, then "success" or "failed"
    #   steps_completed — names of pipeline steps that finished, in order
    #   errors          — stringified exceptions captured by run()
    status: str = "pending"
    steps_completed: list = field(default_factory=list)
    errors: list = field(default_factory=list)

    # File paths of exported artefacts, keyed by artefact name
    # (e.g. "topics_table" -> "<output_dir>/topics_table.csv").
    exported_files: Dict[str, str] = field(default_factory=dict)
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # TopicAgent
67
+ # ---------------------------------------------------------------------------
68
class TopicAgent:
    """
    Orchestrates the research-paper topic modeling pipeline.

    The pipeline runs nine sequential steps (see :meth:`run`): ingest the
    CSV, preprocess Titles and Abstracts, run topic modeling on each corpus,
    generate labels, combine and compare the topic tables, build a taxonomy
    map, and export all artefacts into ``output_dir``.

    The individual ``_step_*`` methods are also callable one at a time (the
    Streamlit frontend drives them directly to report progress), so their
    names and signatures must stay stable.

    Parameters
    ----------
    csv_path : str
        Path to the input CSV file.
    output_dir : str
        Directory to write output files (created if missing).
    min_topics : int
        Minimum number of topics to generate per source (default 100).
    use_llm_labels : bool
        Whether to use Groq LLM for label generation.
    groq_api_key : str, optional
        API key for Groq (used only when use_llm_labels is True).
    """

    def __init__(
        self,
        csv_path: str,
        output_dir: str = "outputs",
        min_topics: int = 100,
        use_llm_labels: bool = False,
        groq_api_key: Optional[str] = None,
    ):
        self.csv_path = csv_path
        self.output_dir = output_dir
        self.min_topics = min_topics
        self.use_llm_labels = use_llm_labels
        self.groq_api_key = groq_api_key

        # Ensure output directory exists before any step tries to write to it.
        os.makedirs(self.output_dir, exist_ok=True)

        self._result = AgentResult()

    # -----------------------------------------------------------------
    # Public interface
    # -----------------------------------------------------------------
    def run(self) -> AgentResult:
        """
        Execute the full pipeline step by step.

        Any exception aborts the remaining steps, marks the result as
        "failed" and records the error message; the partial result is still
        returned so callers can inspect which steps completed.

        Returns
        -------
        AgentResult
            Structured results including all DataFrames, taxonomy, and file paths.
        """
        logger.info("=" * 60)
        logger.info("TopicAgent — Starting pipeline")
        logger.info("=" * 60)

        try:
            self._step_load_csv()
            self._step_preprocess()
            self._step_model_titles()
            self._step_model_abstracts()
            self._step_generate_labels()
            self._step_combine_topics()
            self._step_compare_themes()
            self._step_taxonomy_map()
            self._step_export()

            self._result.status = "success"
            logger.info("Pipeline completed successfully.")

        except Exception as exc:
            self._result.status = "failed"
            self._result.errors.append(str(exc))
            logger.error("Pipeline failed: %s", exc, exc_info=True)

        return self._result

    # -----------------------------------------------------------------
    # Pipeline steps
    # -----------------------------------------------------------------
    def _step_load_csv(self):
        """Step 1 — Ingest CSV dataset into ``self._df``."""
        logger.info("Step 1/9: Loading CSV …")
        self._df = load_csv(self.csv_path)
        self._result.steps_completed.append("load_csv")
        logger.info("  → %d papers loaded.", len(self._df))

    def _step_preprocess(self):
        """Step 2 — Preprocess Title and Abstract text into two clean corpora."""
        logger.info("Step 2/9: Preprocessing text …")
        self._titles_clean = preprocess_text(self._df["Title"].tolist())
        self._abstracts_clean = preprocess_text(self._df["Abstract"].tolist())
        self._result.steps_completed.append("preprocess_text")
        logger.info("  → Titles preprocessed: %d docs", len(self._titles_clean))
        logger.info("  → Abstracts preprocessed: %d docs", len(self._abstracts_clean))

    def _step_model_titles(self):
        """Step 3 — Topic modeling on Titles."""
        logger.info("Step 3/9: Topic modeling on Titles …")
        self._title_topics_df, self._title_model = run_topic_modeling(
            self._titles_clean,
            source_label="Titles",
            min_topics=self.min_topics,
        )
        self._result.steps_completed.append("topic_modeling_titles")
        logger.info("  → %d title topics discovered.", len(self._title_topics_df))

    def _step_model_abstracts(self):
        """Step 4 — Topic modeling on Abstracts."""
        logger.info("Step 4/9: Topic modeling on Abstracts …")
        self._abstract_topics_df, self._abstract_model = run_topic_modeling(
            self._abstracts_clean,
            source_label="Abstracts",
            min_topics=self.min_topics,
        )
        self._result.steps_completed.append("topic_modeling_abstracts")
        logger.info("  → %d abstract topics discovered.", len(self._abstract_topics_df))

    def _step_generate_labels(self):
        """Step 5 — Generate human-readable labels for both topic tables."""
        logger.info("Step 5/9: Generating topic labels …")
        self._title_topics_df = generate_labels(
            self._title_topics_df,
            use_llm=self.use_llm_labels,
            groq_api_key=self.groq_api_key,
        )
        self._abstract_topics_df = generate_labels(
            self._abstract_topics_df,
            use_llm=self.use_llm_labels,
            groq_api_key=self.groq_api_key,
        )
        self._result.title_topics = self._title_topics_df
        self._result.abstract_topics = self._abstract_topics_df
        self._result.steps_completed.append("generate_labels")
        logger.info("  → Labels generated for all topics.")

    def _step_combine_topics(self):
        """Step 6 — Combine title and abstract topics into one table.

        Adds a ``global_id`` column so every topic has a unique identifier
        across both sources.
        """
        logger.info("Step 6/9: Building combined topics table …")
        combined = pd.concat(
            [self._title_topics_df, self._abstract_topics_df],
            ignore_index=True,
        )
        combined["global_id"] = range(len(combined))
        self._result.combined_topics = combined
        self._result.steps_completed.append("combine_topics")
        logger.info("  → Combined table: %d topics total.", len(combined))

    def _step_compare_themes(self):
        """Step 7 — Compare title vs abstract themes."""
        logger.info("Step 7/9: Comparing title vs abstract themes …")
        comparison = compare_themes(self._title_topics_df, self._abstract_topics_df)
        self._result.comparison = comparison
        self._result.steps_completed.append("compare_themes")
        logger.info("  → Comparison table: %d rows.", len(comparison))

    def _step_taxonomy_map(self):
        """Step 8 — Create taxonomy map (MAPPED / NOVEL classification)."""
        logger.info("Step 8/9: Building taxonomy map …")
        # Taxonomy is built from the combined table so it covers both sources.
        taxonomy = create_taxonomy_map(self._result.combined_topics)
        self._result.taxonomy_map = taxonomy
        self._result.steps_completed.append("create_taxonomy_map")
        logger.info(
            "  → MAPPED: %d, NOVEL: %d",
            taxonomy["metadata"]["mapped_count"],
            taxonomy["metadata"]["novel_count"],
        )

    def _step_export(self):
        """Step 9 — Export all outputs to disk (four CSVs plus taxonomy JSON)."""
        logger.info("Step 9/9: Exporting outputs …")

        # CSV artefacts: registry key doubles as the file stem.
        csv_exports = [
            ("topics_table", self._result.combined_topics),
            ("comparison", self._result.comparison),
            ("title_topics", self._result.title_topics),
            ("abstract_topics", self._result.abstract_topics),
        ]
        for key, frame in csv_exports:
            self._export_csv(key, frame)

        # Taxonomy map JSON
        taxonomy_path = os.path.join(self.output_dir, "taxonomy_map.json")
        with open(taxonomy_path, "w", encoding="utf-8") as f:
            json.dump(self._result.taxonomy_map, f, indent=2, ensure_ascii=False)
        self._result.exported_files["taxonomy_map"] = taxonomy_path
        logger.info("  → Saved: %s", taxonomy_path)

        self._result.steps_completed.append("export")
        logger.info("  → All outputs exported successfully.")

    def _export_csv(self, key: str, frame: pd.DataFrame) -> None:
        """Write *frame* to ``<output_dir>/<key>.csv`` and register the path."""
        path = os.path.join(self.output_dir, f"{key}.csv")
        frame.to_csv(path, index=False)
        self._result.exported_files[key] = path
        logger.info("  → Saved: %s", path)
app.py ADDED
@@ -0,0 +1,1191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — Streamlit frontend for the AI-driven Topic Modeling application.
3
+
4
+ This module provides an interactive web interface that allows users to:
5
+ 1. Upload a CSV file containing research paper Titles and Abstracts.
6
+ 2. Configure pipeline parameters (min topics, LLM label generation).
7
+ 3. Run the TopicAgent pipeline with a single click.
8
+ 4. View and explore results: topics table, comparison, taxonomy map.
9
+ 5. Review topics with an editable review table.
10
+ 6. Visualize topic distributions with interactive Plotly charts.
11
+ 7. Download all generated outputs (CSV, JSON).
12
+ """
13
+
14
+ import os
15
+ import json
16
+ import tempfile
17
+
18
+ import streamlit as st
19
+ import pandas as pd
20
+ import plotly.express as px
21
+ import plotly.graph_objects as go
22
+
23
+ from agent import TopicAgent
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # HuggingFace Spaces compatibility: use a writable output directory
27
+ # On HF Spaces the working directory can be read-only, so fall back to /tmp
28
+ # ---------------------------------------------------------------------------
29
OUTPUT_DIR = "outputs"
try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Probe write access: successfully creating and deleting a marker file
    # proves the directory is usable for pipeline artefacts.
    _probe_path = os.path.join(OUTPUT_DIR, ".write_test")
    with open(_probe_path, "w") as _probe:
        _probe.write("ok")
    os.remove(_probe_path)
except OSError:
    # PermissionError is a subclass of OSError, so one clause covers both
    # "cannot create" and "cannot write" — fall back to the system temp dir.
    OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "topic_modeler_outputs")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Page configuration
43
+ # ---------------------------------------------------------------------------
44
+ st.set_page_config(
45
+ page_title="Research Topic Modeler — AI Agent",
46
+ page_icon="🔬",
47
+ layout="wide",
48
+ initial_sidebar_state="expanded",
49
+ )
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Custom CSS for a polished, professional look with dark-safe text colors
53
+ # ---------------------------------------------------------------------------
54
+ st.markdown("""
55
+ <style>
56
+ /* Import Google Font */
57
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
58
+
59
+ /* Global */
60
+ html, body, [class*="css"] {
61
+ font-family: 'Inter', sans-serif;
62
+ }
63
+
64
+ /* Header gradient banner */
65
+ .main-header {
66
+ background: linear-gradient(135deg, #0f0c29 0%, #302b63 50%, #24243e 100%);
67
+ padding: 2rem 2.5rem;
68
+ border-radius: 16px;
69
+ margin-bottom: 1.5rem;
70
+ box-shadow: 0 8px 32px rgba(48, 43, 99, 0.3);
71
+ }
72
+ .main-header h1 {
73
+ color: #ffffff;
74
+ font-size: 2.2rem;
75
+ font-weight: 700;
76
+ margin: 0;
77
+ letter-spacing: -0.5px;
78
+ }
79
+ .main-header p {
80
+ color: #b8b5ff;
81
+ font-size: 1.05rem;
82
+ margin: 0.5rem 0 0 0;
83
+ font-weight: 300;
84
+ }
85
+
86
+ /* Stat cards */
87
+ .stat-card {
88
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
89
+ padding: 1.25rem 1.5rem;
90
+ border-radius: 12px;
91
+ color: white;
92
+ text-align: center;
93
+ box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
94
+ transition: transform 0.2s ease;
95
+ }
96
+ .stat-card:hover {
97
+ transform: translateY(-2px);
98
+ }
99
+ .stat-card .stat-value {
100
+ font-size: 2rem;
101
+ font-weight: 700;
102
+ line-height: 1.2;
103
+ color: #ffffff;
104
+ }
105
+ .stat-card .stat-label {
106
+ font-size: 0.85rem;
107
+ opacity: 0.85;
108
+ margin-top: 0.3rem;
109
+ font-weight: 400;
110
+ color: #e8e6ff;
111
+ }
112
+
113
+ /* Status badge */
114
+ .status-badge {
115
+ display: inline-block;
116
+ padding: 0.3rem 1rem;
117
+ border-radius: 20px;
118
+ font-size: 0.8rem;
119
+ font-weight: 600;
120
+ text-transform: uppercase;
121
+ letter-spacing: 0.5px;
122
+ }
123
+ .status-success {
124
+ background: linear-gradient(135deg, #11998e, #38ef7d);
125
+ color: #ffffff;
126
+ }
127
+ .status-failed {
128
+ background: linear-gradient(135deg, #eb3349, #f45c43);
129
+ color: #ffffff;
130
+ }
131
+ .status-running {
132
+ background: linear-gradient(135deg, #f7971e, #ffd200);
133
+ color: #1a1a2e;
134
+ }
135
+
136
+ /* Section headers — always readable on both light and dark backgrounds */
137
+ .section-header {
138
+ font-size: 1.3rem;
139
+ font-weight: 600;
140
+ color: #c4b5fd;
141
+ margin: 1.5rem 0 0.75rem 0;
142
+ padding-bottom: 0.5rem;
143
+ border-bottom: 2px solid #667eea;
144
+ display: inline-block;
145
+ }
146
+
147
+ /* Taxonomy badges */
148
+ .mapped-badge {
149
+ display: inline-block;
150
+ background: linear-gradient(135deg, #11998e, #38ef7d);
151
+ color: #ffffff;
152
+ padding: 0.2rem 0.7rem;
153
+ border-radius: 12px;
154
+ font-size: 0.75rem;
155
+ font-weight: 600;
156
+ }
157
+ .novel-badge {
158
+ display: inline-block;
159
+ background: linear-gradient(135deg, #fc4a1a, #f7b733);
160
+ color: #ffffff;
161
+ padding: 0.2rem 0.7rem;
162
+ border-radius: 12px;
163
+ font-size: 0.75rem;
164
+ font-weight: 600;
165
+ }
166
+
167
+ /* Sidebar styling */
168
+ section[data-testid="stSidebar"] {
169
+ background: linear-gradient(180deg, #1a1a2e 0%, #16213e 100%);
170
+ }
171
+ section[data-testid="stSidebar"] .stMarkdown {
172
+ color: #e0e0e0;
173
+ }
174
+ section[data-testid="stSidebar"] label {
175
+ color: #e0e0e0 !important;
176
+ }
177
+ section[data-testid="stSidebar"] .stSlider label {
178
+ color: #e0e0e0 !important;
179
+ }
180
+
181
+ /* Data table enhancements */
182
+ .stDataFrame {
183
+ border-radius: 8px;
184
+ overflow: hidden;
185
+ }
186
+
187
+ /* Info box — dark-safe: dark background with light text */
188
+ .info-box {
189
+ background: linear-gradient(135deg, #1e1e3f 0%, #2d2b55 100%);
190
+ padding: 1rem 1.5rem;
191
+ border-radius: 10px;
192
+ border-left: 4px solid #667eea;
193
+ margin: 0.75rem 0;
194
+ color: #e0e0e0;
195
+ }
196
+ .info-box strong {
197
+ color: #ffffff;
198
+ }
199
+ .info-box code {
200
+ background: rgba(102, 126, 234, 0.2);
201
+ color: #b8b5ff;
202
+ padding: 0.1rem 0.4rem;
203
+ border-radius: 4px;
204
+ }
205
+
206
+ /* Pipeline step */
207
+ .step-item {
208
+ padding: 0.5rem 1rem;
209
+ margin: 0.3rem 0;
210
+ border-radius: 8px;
211
+ background: rgba(102, 126, 234, 0.15);
212
+ border-left: 3px solid #667eea;
213
+ font-size: 0.9rem;
214
+ color: #e0e0e0;
215
+ }
216
+
217
+ /* Chart container styling */
218
+ .chart-container {
219
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
220
+ border-radius: 12px;
221
+ padding: 1rem;
222
+ margin: 0.5rem 0;
223
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
224
+ }
225
+
226
+ /* Review section header */
227
+ .review-header {
228
+ background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
229
+ padding: 1rem 1.5rem;
230
+ border-radius: 12px;
231
+ margin-bottom: 1rem;
232
+ box-shadow: 0 4px 15px rgba(17, 153, 142, 0.3);
233
+ }
234
+ .review-header h3 {
235
+ color: #ffffff;
236
+ margin: 0;
237
+ font-weight: 600;
238
+ }
239
+ .review-header p {
240
+ color: #e0fff8;
241
+ margin: 0.3rem 0 0 0;
242
+ font-size: 0.9rem;
243
+ }
244
+
245
+ /* Save confirmation */
246
+ .save-confirm {
247
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
248
+ color: #ffffff;
249
+ padding: 0.75rem 1.25rem;
250
+ border-radius: 10px;
251
+ margin-top: 0.5rem;
252
+ font-weight: 500;
253
+ }
254
+
255
+ /* Ensure tab labels are readable */
256
+ .stTabs [data-baseweb="tab-list"] button {
257
+ color: #c4b5fd;
258
+ }
259
+ .stTabs [data-baseweb="tab-list"] button[aria-selected="true"] {
260
+ color: #ffffff;
261
+ }
262
+ </style>
263
+ """, unsafe_allow_html=True)
264
+
265
+
266
+ # ---------------------------------------------------------------------------
267
+ # Header
268
+ # ---------------------------------------------------------------------------
269
+ st.markdown("""
270
+ <div class="main-header">
271
+ <h1>🔬 Research Topic Modeler</h1>
272
+ <p>AI-powered topic modeling agent for research papers — discover, compare, and classify themes across Titles and Abstracts</p>
273
+ </div>
274
+ """, unsafe_allow_html=True)
275
+
276
+
277
+ # ---------------------------------------------------------------------------
278
+ # Sidebar — Configuration
279
+ # ---------------------------------------------------------------------------
280
+ with st.sidebar:
281
+ st.markdown("## ⚙️ Configuration")
282
+ st.markdown("---")
283
+
284
+ # File upload
285
+ st.markdown("### 📁 Dataset")
286
+ uploaded_file = st.file_uploader(
287
+ "Upload CSV with Title & Abstract columns",
288
+ type=["csv"],
289
+ help="The CSV must contain at least 'Title' and 'Abstract' columns.",
290
+ )
291
+
292
+ # Or use default dataset
293
+ use_default = st.checkbox(
294
+ "Use default dataset (dataset.csv)",
295
+ value=True if not uploaded_file else False,
296
+ help="Use the bundled dataset.csv file in the project directory.",
297
+ )
298
+
299
+ st.markdown("---")
300
+ st.markdown("### 🎯 Parameters")
301
+
302
+ min_topics = st.slider(
303
+ "Minimum Topics",
304
+ min_value=50,
305
+ max_value=200,
306
+ value=100,
307
+ step=10,
308
+ help="Minimum number of topics to generate per source (Titles / Abstracts).",
309
+ )
310
+
311
+ use_llm = st.checkbox(
312
+ "🤖 Use LLM for Label Generation (Groq)",
313
+ value=False,
314
+ help="Use Groq's LLaMA model to generate contextual topic labels. "
315
+ "Falls back to keyword heuristic if unchecked.",
316
+ )
317
+
318
+ groq_key = os.environ.get("GROQ_API_KEY", "")
319
+ if use_llm:
320
+ groq_key = st.text_input(
321
+ "Groq API Key",
322
+ value=groq_key,
323
+ type="password",
324
+ help="Your Groq API key for LLM label generation.",
325
+ )
326
+
327
+ st.markdown("---")
328
+ st.markdown("### 📋 Pipeline Steps")
329
+ steps_info = [
330
+ "1. Load & validate CSV",
331
+ "2. Preprocess text (Titles + Abstracts)",
332
+ "3. Topic modeling — Titles (≥{} topics)".format(min_topics),
333
+ "4. Topic modeling — Abstracts (≥{} topics)".format(min_topics),
334
+ "5. Generate human-readable labels",
335
+ "6. Combine topics table",
336
+ "7. Compare themes (Title vs Abstract)",
337
+ "8. Build taxonomy map (MAPPED / NOVEL)",
338
+ "9. Export outputs (CSV, JSON)",
339
+ ]
340
+ for step in steps_info:
341
+ st.markdown(f'<div class="step-item">{step}</div>', unsafe_allow_html=True)
342
+
343
+
344
+ # ---------------------------------------------------------------------------
345
+ # Main area — Run button and results
346
+ # ---------------------------------------------------------------------------
347
+ col_run, col_status = st.columns([2, 3])
348
+
349
+ with col_run:
350
+ run_clicked = st.button("🚀 Run Topic Modeling Agent", use_container_width=True, type="primary")
351
+
352
+ with col_status:
353
+ if "result" in st.session_state and st.session_state.result is not None:
354
+ res = st.session_state.result
355
+ if res.status == "success":
356
+ st.markdown('<span class="status-badge status-success">✓ Pipeline Complete</span>', unsafe_allow_html=True)
357
+ elif res.status == "failed":
358
+ st.markdown('<span class="status-badge status-failed">✗ Pipeline Failed</span>', unsafe_allow_html=True)
359
+ else:
360
+ st.markdown('<span class="status-badge status-running">● Awaiting Input</span>', unsafe_allow_html=True)
361
+
362
+
363
+ # ---------------------------------------------------------------------------
364
+ # Execute pipeline
365
+ # ---------------------------------------------------------------------------
366
if run_clicked:
    # ---- Resolve the input CSV path -----------------------------------
    csv_path = None
    if uploaded_file is not None:
        # Persist the upload to the system temp directory. Do NOT write it
        # into the working directory (dir=".") — on HF Spaces the working
        # directory can be read-only, which is exactly why OUTPUT_DIR has a
        # /tmp fallback above.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            tmp.write(uploaded_file.getvalue())
            csv_path = tmp.name
    elif use_default:
        csv_path = "dataset.csv"
        if not os.path.exists(csv_path):
            st.error("❌ Default dataset.csv not found in the project directory.")
            st.stop()
    else:
        st.error("❌ Please upload a CSV file or select the default dataset.")
        st.stop()

    # ---- Run the agent step by step so the UI can report progress ------
    with st.spinner("🔄 Running the Topic Modeling Agent … this may take a few minutes."):
        progress = st.progress(0, text="Initializing …")

        agent = TopicAgent(
            csv_path=csv_path,
            output_dir=OUTPUT_DIR,
            min_topics=min_topics,
            use_llm_labels=use_llm,
            groq_api_key=groq_key if use_llm else None,
        )

        # (progress %, status text, bound step method) — same order and
        # milestones as TopicAgent.run().
        steps = [
            (5, "Step 1/9: Loading CSV …", agent._step_load_csv),
            (10, "Step 2/9: Preprocessing text …", agent._step_preprocess),
            (20, "Step 3/9: Topic modeling on Titles …", agent._step_model_titles),
            (45, "Step 4/9: Topic modeling on Abstracts …", agent._step_model_abstracts),
            (65, "Step 5/9: Generating topic labels …", agent._step_generate_labels),
            (75, "Step 6/9: Building combined topics table …", agent._step_combine_topics),
            (80, "Step 7/9: Comparing themes …", agent._step_compare_themes),
            (90, "Step 8/9: Building taxonomy map …", agent._step_taxonomy_map),
            (95, "Step 9/9: Exporting outputs …", agent._step_export),
        ]
        try:
            for pct, text, step_fn in steps:
                progress.progress(pct, text=text)
                step_fn()
        except Exception as exc:
            # Mirror TopicAgent.run(): record the failure on the result so
            # the status badge and error panel render, instead of letting a
            # raw traceback take over the page.
            agent._result.status = "failed"
            agent._result.errors.append(str(exc))
            progress.progress(100, text="❌ Pipeline failed.")
            st.error(f"❌ Pipeline failed: {exc}")
        else:
            agent._result.status = "success"
            progress.progress(100, text="✅ Pipeline complete!")

        st.session_state.result = agent._result

    # Clean up the temporary copy of the upload (best-effort).
    if uploaded_file is not None and csv_path and os.path.exists(csv_path):
        try:
            os.unlink(csv_path)
        except Exception:
            pass

    st.rerun()
428
+
429
+
430
+ # ---------------------------------------------------------------------------
431
+ # Helper: Plotly chart theme (dark background, readable text)
432
+ # ---------------------------------------------------------------------------
433
# Shared Plotly layout options: dark panel backgrounds matched to the app's
# CSS palette (#1a1a2e / #16213e), with light font/tick colors so text stays
# readable against them.
PLOTLY_LAYOUT = dict(
    paper_bgcolor="rgba(26, 26, 46, 0.95)",
    plot_bgcolor="rgba(22, 33, 62, 0.95)",
    font=dict(family="Inter, sans-serif", size=13, color="#e0e0e0"),
    title_font=dict(size=18, color="#ffffff"),
    legend=dict(
        font=dict(color="#e0e0e0"),
        bgcolor="rgba(26, 26, 46, 0.7)",
        bordercolor="#667eea",
        borderwidth=1,
    ),
    # Subtle indigo gridlines on both axes, slightly stronger zero lines.
    xaxis=dict(
        gridcolor="rgba(102, 126, 234, 0.15)",
        zerolinecolor="rgba(102, 126, 234, 0.25)",
        tickfont=dict(color="#c4b5fd"),
        title_font=dict(color="#e0e0e0"),
    ),
    yaxis=dict(
        gridcolor="rgba(102, 126, 234, 0.15)",
        zerolinecolor="rgba(102, 126, 234, 0.25)",
        tickfont=dict(color="#c4b5fd"),
        title_font=dict(color="#e0e0e0"),
    ),
    margin=dict(l=60, r=30, t=60, b=60),
)

# Gradient-like color sequence (hex colors matching the app's indigo/purple
# palette) for chart traces.
CHART_COLORS = [
    "#667eea", "#764ba2", "#f093fb", "#f5576c",
    "#4facfe", "#00f2fe", "#43e97b", "#38f9d7",
    "#fa709a", "#fee140", "#a18cd1", "#fbc2eb",
    "#ff9a9e", "#fad0c4", "#ffecd2", "#fcb69f",
]
466
+
467
+
468
# ---------------------------------------------------------------------------
# Display results
# ---------------------------------------------------------------------------
if "result" in st.session_state and st.session_state.result is not None:
    result = st.session_state.result

    # Abort rendering entirely when the pipeline reported failure.
    if result.status == "failed":
        st.error(f"Pipeline failed with errors: {result.errors}")
        st.stop()

    # ---- Summary Statistics ----
    st.markdown('<div class="section-header">📊 Summary Statistics</div>', unsafe_allow_html=True)

    # One (value, label) pair per headline metric; rendered in a single loop
    # instead of five copy-pasted column blocks (same markup, same order).
    _meta = result.taxonomy_map.get("metadata", {})
    _stat_cards = [
        (len(result.title_topics), "Title Topics"),
        (len(result.abstract_topics), "Abstract Topics"),
        (len(result.combined_topics), "Total Topics"),
        (_meta.get("mapped_count", 0), "Mapped Themes"),
        (_meta.get("novel_count", 0), "Novel Themes"),
    ]
    for _col, (_value, _label) in zip(st.columns(5), _stat_cards):
        with _col:
            st.markdown(f"""
        <div class="stat-card">
            <div class="stat-value">{_value}</div>
            <div class="stat-label">{_label}</div>
        </div>
        """, unsafe_allow_html=True)

    st.markdown("<br>", unsafe_allow_html=True)
521
+
522
+ # ---- Tabbed Results ----
523
+ tab1, tab2, tab3, tab4, tab5, tab_review, tab_charts = st.tabs([
524
+ "📋 Topics Table",
525
+ "🔬 Title Topics",
526
+ "📄 Abstract Topics",
527
+ "⚖️ Theme Comparison",
528
+ "🗺️ Taxonomy Map",
529
+ "✏️ Review Table",
530
+ "📈 Charts",
531
+ ])
532
+
533
+ # Tab 1: Combined Topics Table
534
+ with tab1:
535
+ st.markdown('<div class="section-header">Combined Topics Table</div>', unsafe_allow_html=True)
536
+ st.markdown(f"Showing all **{len(result.combined_topics)}** topics from both Titles and Abstracts.")
537
+
538
+ # Filter controls
539
+ fcol1, fcol2 = st.columns(2)
540
+ with fcol1:
541
+ source_filter = st.multiselect(
542
+ "Filter by Source",
543
+ options=result.combined_topics["source"].unique().tolist(),
544
+ default=result.combined_topics["source"].unique().tolist(),
545
+ )
546
+ with fcol2:
547
+ search_term = st.text_input("🔍 Search keywords", "")
548
+
549
+ display_df = result.combined_topics[result.combined_topics["source"].isin(source_filter)]
550
+ if search_term:
551
+ mask = display_df["keywords"].str.contains(search_term, case=False, na=False)
552
+ mask |= display_df["label"].str.contains(search_term, case=False, na=False)
553
+ display_df = display_df[mask]
554
+
555
+ st.dataframe(
556
+ display_df,
557
+ use_container_width=True,
558
+ height=500,
559
+ column_config={
560
+ "topic_id": st.column_config.NumberColumn("Topic ID", width="small"),
561
+ "keywords": st.column_config.TextColumn("Keywords", width="large"),
562
+ "label": st.column_config.TextColumn("Label", width="medium"),
563
+ "source": st.column_config.TextColumn("Source", width="small"),
564
+ },
565
+ )
566
+
567
+ # Tab 2: Title Topics
568
+ with tab2:
569
+ st.markdown('<div class="section-header">Title Topics</div>', unsafe_allow_html=True)
570
+ st.markdown(f"**{len(result.title_topics)}** topics discovered from paper titles.")
571
+ st.dataframe(result.title_topics, use_container_width=True, height=500)
572
+
573
+ # Tab 3: Abstract Topics
574
+ with tab3:
575
+ st.markdown('<div class="section-header">Abstract Topics</div>', unsafe_allow_html=True)
576
+ st.markdown(f"**{len(result.abstract_topics)}** topics discovered from paper abstracts.")
577
+ st.dataframe(result.abstract_topics, use_container_width=True, height=500)
578
+
579
+ # Tab 4: Theme Comparison
580
+ with tab4:
581
+ st.markdown('<div class="section-header">Theme Comparison: Titles vs Abstracts</div>', unsafe_allow_html=True)
582
+
583
+ if not result.comparison.empty:
584
+ # Alignment distribution
585
+ align_counts = result.comparison["alignment"].value_counts()
586
+ acol1, acol2, acol3, acol4 = st.columns(4)
587
+ for col, alignment in zip(
588
+ [acol1, acol2, acol3, acol4],
589
+ ["Strong", "Moderate", "Weak", "No Match"],
590
+ ):
591
+ with col:
592
+ count = align_counts.get(alignment, 0)
593
+ st.metric(label=f"{alignment} Alignment", value=count)
594
+
595
+ st.markdown("<br>", unsafe_allow_html=True)
596
+
597
+ # Filter by alignment
598
+ alignment_filter = st.multiselect(
599
+ "Filter by Alignment",
600
+ options=["Strong", "Moderate", "Weak", "No Match"],
601
+ default=["Strong", "Moderate", "Weak", "No Match"],
602
+ )
603
+ filtered_comp = result.comparison[result.comparison["alignment"].isin(alignment_filter)]
604
+
605
+ st.dataframe(
606
+ filtered_comp,
607
+ use_container_width=True,
608
+ height=500,
609
+ column_config={
610
+ "similarity": st.column_config.ProgressColumn(
611
+ "Similarity",
612
+ min_value=0,
613
+ max_value=1,
614
+ format="%.2f",
615
+ ),
616
+ },
617
+ )
618
+ else:
619
+ st.info("No comparison data available.")
620
+
621
+ # Tab 5: Taxonomy Map
622
+ with tab5:
623
+ st.markdown('<div class="section-header">Taxonomy Map</div>', unsafe_allow_html=True)
624
+
625
+ taxonomy = result.taxonomy_map
626
+ meta = taxonomy.get("metadata", {})
627
+
628
+ st.markdown(f"""
629
+ <div class="info-box">
630
+ <strong>Classification Summary:</strong><br>
631
+ Total Topics: <strong>{meta.get('total_topics', 0)}</strong> |
632
+ <span class="mapped-badge">MAPPED: {meta.get('mapped_count', 0)}</span> |
633
+ <span class="novel-badge">NOVEL: {meta.get('novel_count', 0)}</span> |
634
+ Threshold: {meta.get('threshold', 0.15)}
635
+ </div>
636
+ """, unsafe_allow_html=True)
637
+
638
+ tax_tab1, tax_tab2 = st.tabs(["✅ Mapped Themes", "🆕 Novel Themes"])
639
+
640
+ with tax_tab1:
641
+ mapped_list = taxonomy.get("mapped", [])
642
+ if mapped_list:
643
+ mapped_df = pd.DataFrame(mapped_list)
644
+ st.dataframe(
645
+ mapped_df,
646
+ use_container_width=True,
647
+ height=400,
648
+ column_config={
649
+ "score": st.column_config.ProgressColumn(
650
+ "Match Score",
651
+ min_value=0,
652
+ max_value=1,
653
+ format="%.3f",
654
+ ),
655
+ },
656
+ )
657
+ else:
658
+ st.info("No mapped themes found.")
659
+
660
+ with tax_tab2:
661
+ novel_list = taxonomy.get("novel", [])
662
+ if novel_list:
663
+ novel_df = pd.DataFrame(novel_list)
664
+ st.dataframe(
665
+ novel_df,
666
+ use_container_width=True,
667
+ height=400,
668
+ column_config={
669
+ "score": st.column_config.ProgressColumn(
670
+ "Match Score",
671
+ min_value=0,
672
+ max_value=1,
673
+ format="%.3f",
674
+ ),
675
+ },
676
+ )
677
+ else:
678
+ st.info("No novel themes found.")
679
+
680
+ # ==================================================================
681
+ # Tab 6: Editable Review Table
682
+ # ==================================================================
683
+ with tab_review:
684
+ st.markdown("""
685
+ <div class="review-header">
686
+ <h3>✏️ Topic Review Table</h3>
687
+ <p>Review, approve, rename, and annotate each topic. Changes are saved to outputs/review_table.csv.</p>
688
+ </div>
689
+ """, unsafe_allow_html=True)
690
+
691
+ # Build review dataframe from combined topics
692
+ # Load existing review table if available to preserve edits
693
+ review_csv_path = os.path.join(OUTPUT_DIR, "review_table.csv")
694
+
695
+ if "review_df" not in st.session_state:
696
+ if os.path.exists(review_csv_path):
697
+ # Load previously saved review table
698
+ existing_review = pd.read_csv(review_csv_path)
699
+ # Merge with current topics to ensure all topics are represented
700
+ current_ids = set(result.combined_topics["topic_id"].tolist())
701
+ existing_ids = set(existing_review["topic_id"].tolist()) if "topic_id" in existing_review.columns else set()
702
+
703
+ if current_ids == existing_ids or existing_ids.issuperset(current_ids):
704
+ st.session_state.review_df = existing_review
705
+ else:
706
+ # Rebuild from current topics, but preserve existing edits
707
+ review_data = []
708
+ for _, row in result.combined_topics.iterrows():
709
+ review_data.append({
710
+ "topic_id": int(row["topic_id"]),
711
+ "label": row.get("label", ""),
712
+ "keywords": row.get("keywords", ""),
713
+ "source": row.get("source", ""),
714
+ "approve": False,
715
+ "rename_to": "",
716
+ "reasoning": "",
717
+ })
718
+ new_review_df = pd.DataFrame(review_data)
719
+ # Merge existing edits
720
+ if not existing_review.empty and "topic_id" in existing_review.columns:
721
+ for _, erow in existing_review.iterrows():
722
+ mask = new_review_df["topic_id"] == erow["topic_id"]
723
+ if mask.any():
724
+ if "approve" in erow:
725
+ new_review_df.loc[mask, "approve"] = erow["approve"]
726
+ if "rename_to" in erow and pd.notna(erow["rename_to"]):
727
+ new_review_df.loc[mask, "rename_to"] = erow["rename_to"]
728
+ if "reasoning" in erow and pd.notna(erow["reasoning"]):
729
+ new_review_df.loc[mask, "reasoning"] = erow["reasoning"]
730
+ st.session_state.review_df = new_review_df
731
+ else:
732
+ # Build fresh review table
733
+ review_data = []
734
+ for _, row in result.combined_topics.iterrows():
735
+ review_data.append({
736
+ "topic_id": int(row["topic_id"]),
737
+ "label": row.get("label", ""),
738
+ "keywords": row.get("keywords", ""),
739
+ "source": row.get("source", ""),
740
+ "approve": False,
741
+ "rename_to": "",
742
+ "reasoning": "",
743
+ })
744
+ st.session_state.review_df = pd.DataFrame(review_data)
745
+
746
+ # Filter controls for review table
747
+ rv_col1, rv_col2, rv_col3 = st.columns(3)
748
+ with rv_col1:
749
+ review_source_filter = st.multiselect(
750
+ "Filter by Source",
751
+ options=st.session_state.review_df["source"].unique().tolist(),
752
+ default=st.session_state.review_df["source"].unique().tolist(),
753
+ key="review_source_filter",
754
+ )
755
+ with rv_col2:
756
+ review_search = st.text_input("🔍 Search in review table", "", key="review_search")
757
+ with rv_col3:
758
+ review_approval_filter = st.selectbox(
759
+ "Show",
760
+ options=["All Topics", "Approved Only", "Not Approved"],
761
+ index=0,
762
+ key="review_approval_filter",
763
+ )
764
+
765
+ # Apply filters
766
+ filtered_review = st.session_state.review_df[
767
+ st.session_state.review_df["source"].isin(review_source_filter)
768
+ ]
769
+ if review_search:
770
+ search_mask = (
771
+ filtered_review["keywords"].str.contains(review_search, case=False, na=False) |
772
+ filtered_review["label"].str.contains(review_search, case=False, na=False)
773
+ )
774
+ filtered_review = filtered_review[search_mask]
775
+ if review_approval_filter == "Approved Only":
776
+ filtered_review = filtered_review[filtered_review["approve"] == True]
777
+ elif review_approval_filter == "Not Approved":
778
+ filtered_review = filtered_review[filtered_review["approve"] == False]
779
+
780
+ # Editable data editor
781
+ edited_df = st.data_editor(
782
+ filtered_review,
783
+ use_container_width=True,
784
+ height=500,
785
+ num_rows="fixed",
786
+ key="review_editor",
787
+ column_config={
788
+ "topic_id": st.column_config.NumberColumn(
789
+ "Topic ID", width="small", disabled=True
790
+ ),
791
+ "label": st.column_config.TextColumn(
792
+ "Label", width="medium",
793
+ ),
794
+ "keywords": st.column_config.TextColumn(
795
+ "Keywords", width="large", disabled=True,
796
+ ),
797
+ "source": st.column_config.TextColumn(
798
+ "Source", width="small", disabled=True,
799
+ ),
800
+ "approve": st.column_config.CheckboxColumn(
801
+ "✅ Approve", width="small", default=False,
802
+ ),
803
+ "rename_to": st.column_config.TextColumn(
804
+ "Rename To", width="medium",
805
+ ),
806
+ "reasoning": st.column_config.TextColumn(
807
+ "Reasoning / Notes", width="large",
808
+ ),
809
+ },
810
+ column_order=["topic_id", "label", "keywords", "approve", "rename_to", "reasoning", "source"],
811
+ )
812
+
813
+ # Update session state with edits
814
+ if edited_df is not None:
815
+ # Merge edits back into the full review dataframe
816
+ for idx, erow in edited_df.iterrows():
817
+ mask = st.session_state.review_df.index == idx
818
+ if mask.any():
819
+ for col in ["label", "approve", "rename_to", "reasoning"]:
820
+ if col in erow:
821
+ st.session_state.review_df.loc[mask, col] = erow[col]
822
+
823
+ # Save button
824
+ sv_col1, sv_col2, sv_col3 = st.columns([1, 1, 2])
825
+ with sv_col1:
826
+ if st.button("💾 Save Review Table", use_container_width=True, type="primary"):
827
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
828
+ st.session_state.review_df.to_csv(review_csv_path, index=False)
829
+ st.markdown(
830
+ '<div class="save-confirm">✅ Review table saved to outputs/review_table.csv</div>',
831
+ unsafe_allow_html=True,
832
+ )
833
+ with sv_col2:
834
+ approved_count = int(st.session_state.review_df["approve"].sum()) if "approve" in st.session_state.review_df.columns else 0
835
+ total_count = len(st.session_state.review_df)
836
+ st.markdown(f"""
837
+ <div class="stat-card" style="padding: 0.75rem 1rem;">
838
+ <div class="stat-value" style="font-size: 1.4rem;">{approved_count}/{total_count}</div>
839
+ <div class="stat-label">Topics Approved</div>
840
+ </div>
841
+ """, unsafe_allow_html=True)
842
+
843
+ # ==================================================================
844
+ # Tab 7: Charts
845
+ # ==================================================================
846
+ with tab_charts:
847
+ st.markdown('<div class="section-header">📈 Topic Visualizations</div>', unsafe_allow_html=True)
848
+
849
+ # -----------------------------------------------------------
850
+ # Chart 1: Topic Frequency by Source
851
+ # -----------------------------------------------------------
852
+ st.markdown("#### 📊 Topic Frequency by Source")
853
+ st.caption("Number of topics discovered from each source (Titles vs Abstracts).")
854
+
855
+ source_counts = result.combined_topics["source"].value_counts().reset_index()
856
+ source_counts.columns = ["Source", "Count"]
857
+
858
+ fig1 = px.bar(
859
+ source_counts,
860
+ x="Source",
861
+ y="Count",
862
+ color="Source",
863
+ color_discrete_sequence=["#667eea", "#764ba2"],
864
+ text="Count",
865
+ )
866
+ fig1.update_traces(
867
+ textposition="outside",
868
+ textfont=dict(color="#e0e0e0", size=14, family="Inter"),
869
+ marker=dict(
870
+ line=dict(width=0),
871
+ ),
872
+ )
873
+ fig1.update_layout(
874
+ **PLOTLY_LAYOUT,
875
+ title="Topic Count by Source",
876
+ xaxis_title="Source",
877
+ yaxis_title="Number of Topics",
878
+ showlegend=False,
879
+ height=420,
880
+ )
881
+ st.plotly_chart(fig1, use_container_width=True)
882
+
883
+ st.markdown("---")
884
+
885
+ # -----------------------------------------------------------
886
+ # Chart 2: Top Keywords Across All Topics
887
+ # -----------------------------------------------------------
888
+ st.markdown("#### 🔤 Top Keywords Across All Topics")
889
+ st.caption("Most frequently occurring keywords across all discovered topics.")
890
+
891
+ # Extract all keywords, count frequencies
892
+ all_keywords = []
893
+ for kw_str in result.combined_topics["keywords"].dropna():
894
+ for kw in kw_str.split(","):
895
+ kw_clean = kw.strip().lower()
896
+ if kw_clean and len(kw_clean) > 2:
897
+ all_keywords.append(kw_clean)
898
+
899
+ kw_counts = pd.Series(all_keywords).value_counts().head(25).reset_index()
900
+ kw_counts.columns = ["Keyword", "Frequency"]
901
+
902
+ fig2 = px.bar(
903
+ kw_counts,
904
+ x="Frequency",
905
+ y="Keyword",
906
+ orientation="h",
907
+ color="Frequency",
908
+ color_continuous_scale=["#302b63", "#667eea", "#f093fb", "#f5576c"],
909
+ )
910
+ fig2.update_traces(
911
+ marker=dict(line=dict(width=0)),
912
+ )
913
+ fig2.update_layout(
914
+ **PLOTLY_LAYOUT,
915
+ title="Top 25 Keywords by Frequency",
916
+ xaxis_title="Frequency (across all topics)",
917
+ yaxis_title="",
918
+ height=700,
919
+ coloraxis_colorbar=dict(
920
+ title="Freq",
921
+ tickfont=dict(color="#c4b5fd"),
922
+ title_font=dict(color="#e0e0e0"),
923
+ ),
924
+ )
925
+ # Override yaxis separately to avoid duplicate keyword with PLOTLY_LAYOUT
926
+ fig2.update_layout(
927
+ yaxis=dict(
928
+ autorange="reversed",
929
+ gridcolor="rgba(102, 126, 234, 0.1)",
930
+ tickfont=dict(color="#c4b5fd", size=12),
931
+ ),
932
+ )
933
+ st.plotly_chart(fig2, use_container_width=True)
934
+
935
+ st.markdown("---")
936
+
937
+ # -----------------------------------------------------------
938
+ # Chart 3: Taxonomy Distribution (Mapped vs Novel)
939
+ # -----------------------------------------------------------
940
+ st.markdown("#### 🧬 Taxonomy Classification Distribution")
941
+ st.caption("How topics are classified against the known research taxonomy.")
942
+
943
+ tax_meta = result.taxonomy_map.get("metadata", {})
944
+ tax_data = pd.DataFrame({
945
+ "Classification": ["MAPPED", "NOVEL"],
946
+ "Count": [tax_meta.get("mapped_count", 0), tax_meta.get("novel_count", 0)],
947
+ })
948
+
949
+ chart3_col1, chart3_col2 = st.columns(2)
950
+
951
+ with chart3_col1:
952
+ fig3a = px.pie(
953
+ tax_data,
954
+ values="Count",
955
+ names="Classification",
956
+ color="Classification",
957
+ color_discrete_map={
958
+ "MAPPED": "#38ef7d",
959
+ "NOVEL": "#f7b733",
960
+ },
961
+ hole=0.55,
962
+ )
963
+ fig3a.update_traces(
964
+ textfont=dict(color="#ffffff", size=14),
965
+ textinfo="percent+label",
966
+ marker=dict(line=dict(color="#1a1a2e", width=3)),
967
+ )
968
+ fig3a.update_layout(
969
+ paper_bgcolor="rgba(26, 26, 46, 0.95)",
970
+ plot_bgcolor="rgba(22, 33, 62, 0.95)",
971
+ font=dict(family="Inter, sans-serif", size=13, color="#e0e0e0"),
972
+ title=dict(text="Mapped vs Novel", font=dict(size=16, color="#ffffff")),
973
+ legend=dict(font=dict(color="#e0e0e0")),
974
+ height=380,
975
+ margin=dict(l=20, r=20, t=50, b=20),
976
+ )
977
+ st.plotly_chart(fig3a, use_container_width=True)
978
+
979
+ with chart3_col2:
980
+ fig3b = px.bar(
981
+ tax_data,
982
+ x="Classification",
983
+ y="Count",
984
+ color="Classification",
985
+ color_discrete_map={
986
+ "MAPPED": "#38ef7d",
987
+ "NOVEL": "#f7b733",
988
+ },
989
+ text="Count",
990
+ )
991
+ fig3b.update_traces(
992
+ textposition="outside",
993
+ textfont=dict(color="#e0e0e0", size=16, family="Inter"),
994
+ marker=dict(line=dict(width=0)),
995
+ )
996
+ fig3b.update_layout(
997
+ **PLOTLY_LAYOUT,
998
+ title="Classification Count",
999
+ xaxis_title="",
1000
+ yaxis_title="Number of Topics",
1001
+ showlegend=False,
1002
+ height=380,
1003
+ )
1004
+ st.plotly_chart(fig3b, use_container_width=True)
1005
+
1006
+ st.markdown("---")
1007
+
1008
+ # -----------------------------------------------------------
1009
+ # Chart 4: Alignment Distribution (from comparisons)
1010
+ # -----------------------------------------------------------
1011
+ if not result.comparison.empty:
1012
+ st.markdown("#### ⚖️ Theme Alignment Distribution")
1013
+ st.caption("Distribution of alignment strength between Title and Abstract topics.")
1014
+
1015
+ alignment_data = result.comparison["alignment"].value_counts().reset_index()
1016
+ alignment_data.columns = ["Alignment", "Count"]
1017
+
1018
+ # Define order and colors
1019
+ align_order = ["Strong", "Moderate", "Weak", "No Match"]
1020
+ align_colors = {
1021
+ "Strong": "#38ef7d",
1022
+ "Moderate": "#4facfe",
1023
+ "Weak": "#f7971e",
1024
+ "No Match": "#f5576c",
1025
+ }
1026
+
1027
+ fig4 = px.bar(
1028
+ alignment_data,
1029
+ x="Alignment",
1030
+ y="Count",
1031
+ color="Alignment",
1032
+ color_discrete_map=align_colors,
1033
+ text="Count",
1034
+ category_orders={"Alignment": align_order},
1035
+ )
1036
+ fig4.update_traces(
1037
+ textposition="outside",
1038
+ textfont=dict(color="#e0e0e0", size=14, family="Inter"),
1039
+ marker=dict(line=dict(width=0)),
1040
+ )
1041
+ fig4.update_layout(
1042
+ **PLOTLY_LAYOUT,
1043
+ title="Title ↔ Abstract Alignment Distribution",
1044
+ xaxis_title="Alignment Level",
1045
+ yaxis_title="Number of Topic Pairs",
1046
+ showlegend=False,
1047
+ height=420,
1048
+ )
1049
+ st.plotly_chart(fig4, use_container_width=True)
1050
+
1051
+ st.markdown("---")
1052
+
1053
+ # -----------------------------------------------------------
1054
+ # Chart 5: Similarity Score Histogram
1055
+ # -----------------------------------------------------------
1056
+ st.markdown("#### 📐 Similarity Score Distribution")
1057
+ st.caption("Distribution of Jaccard similarity scores between matched Title and Abstract topics.")
1058
+
1059
+ fig5 = px.histogram(
1060
+ result.comparison,
1061
+ x="similarity",
1062
+ nbins=30,
1063
+ color_discrete_sequence=["#667eea"],
1064
+ marginal="box",
1065
+ )
1066
+ fig5.update_traces(
1067
+ marker=dict(
1068
+ line=dict(width=1, color="#b8b5ff"),
1069
+ ),
1070
+ selector=dict(type="histogram"),
1071
+ )
1072
+ fig5.update_layout(
1073
+ **PLOTLY_LAYOUT,
1074
+ title="Similarity Score Histogram",
1075
+ xaxis_title="Jaccard Similarity Score",
1076
+ yaxis_title="Count",
1077
+ height=420,
1078
+ bargap=0.05,
1079
+ )
1080
+ st.plotly_chart(fig5, use_container_width=True)
1081
+
1082
+ # ---- Downloads Section ----
1083
+ st.markdown('<div class="section-header">📥 Download Outputs</div>', unsafe_allow_html=True)
1084
+
1085
+ dcol1, dcol2, dcol3, dcol4 = st.columns(4)
1086
+
1087
+ with dcol1:
1088
+ csv_data = result.combined_topics.to_csv(index=False)
1089
+ st.download_button(
1090
+ "⬇️ Topics Table (CSV)",
1091
+ data=csv_data,
1092
+ file_name="topics_table.csv",
1093
+ mime="text/csv",
1094
+ use_container_width=True,
1095
+ )
1096
+
1097
+ with dcol2:
1098
+ comp_data = result.comparison.to_csv(index=False)
1099
+ st.download_button(
1100
+ "⬇️ Comparison (CSV)",
1101
+ data=comp_data,
1102
+ file_name="comparison.csv",
1103
+ mime="text/csv",
1104
+ use_container_width=True,
1105
+ )
1106
+
1107
+ with dcol3:
1108
+ json_data = json.dumps(result.taxonomy_map, indent=2, ensure_ascii=False)
1109
+ st.download_button(
1110
+ "⬇️ Taxonomy Map (JSON)",
1111
+ data=json_data,
1112
+ file_name="taxonomy_map.json",
1113
+ mime="application/json",
1114
+ use_container_width=True,
1115
+ )
1116
+
1117
+ with dcol4:
1118
+ # Download review table if it exists
1119
+ review_path = os.path.join(OUTPUT_DIR, "review_table.csv")
1120
+ if os.path.exists(review_path):
1121
+ with open(review_path, "r") as f:
1122
+ review_data = f.read()
1123
+ st.download_button(
1124
+ "⬇️ Review Table (CSV)",
1125
+ data=review_data,
1126
+ file_name="review_table.csv",
1127
+ mime="text/csv",
1128
+ use_container_width=True,
1129
+ )
1130
+ else:
1131
+ st.download_button(
1132
+ "⬇️ Review Table (CSV)",
1133
+ data="Not saved yet. Go to Review Table tab and click Save.",
1134
+ file_name="review_table.csv",
1135
+ mime="text/csv",
1136
+ use_container_width=True,
1137
+ disabled=True,
1138
+ )
1139
+
1140
+ # ---- Auto-save comparison.csv and taxonomy_map.json to outputs ----
1141
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
1142
+ result.comparison.to_csv(os.path.join(OUTPUT_DIR, "comparison.csv"), index=False)
1143
+ with open(os.path.join(OUTPUT_DIR, "taxonomy_map.json"), "w", encoding="utf-8") as f:
1144
+ json.dump(result.taxonomy_map, f, indent=2, ensure_ascii=False)
1145
+
1146
+ # ---- Pipeline Log ----
1147
+ with st.expander("📜 Pipeline Execution Log"):
1148
+ st.markdown(f"**Status:** `{result.status}`")
1149
+ st.markdown(f"**Steps Completed:** {len(result.steps_completed)}/9")
1150
+ for i, step in enumerate(result.steps_completed, 1):
1151
+ st.markdown(f" ✅ Step {i}: `{step}`")
1152
+ if result.errors:
1153
+ st.markdown("**Errors:**")
1154
+ for err in result.errors:
1155
+ st.error(err)
1156
+ st.markdown("**Exported Files:**")
1157
+ for name, path in result.exported_files.items():
1158
+ st.markdown(f" 📄 `{name}` → `{path}`")
1159
+
1160
+ else:
1161
+ # ---- Welcome / instructions when no results ----
1162
+ st.markdown("""
1163
+ <div class="info-box">
1164
+ <strong>👋 Welcome!</strong><br><br>
1165
+ This application uses an AI agent to perform comprehensive topic modeling on research papers.
1166
+ <br><br>
1167
+ <strong>How to use:</strong><br>
1168
+ 1️⃣ Upload a CSV file with <code>Title</code> and <code>Abstract</code> columns (or use the default dataset).<br>
1169
+ 2️⃣ Configure the minimum number of topics and label generation method in the sidebar.<br>
1170
+ 3️⃣ Click <strong>"🚀 Run Topic Modeling Agent"</strong> to start the analysis.<br>
1171
+ 4️⃣ Explore topics, comparisons, and taxonomy classification in the results tabs.<br>
1172
+ 5️⃣ Review and annotate topics in the <strong>✏️ Review Table</strong> tab.<br>
1173
+ 6️⃣ View interactive charts in the <strong>📈 Charts</strong> tab.<br>
1174
+ 7️⃣ Download all outputs as CSV and JSON files.
1175
+ </div>
1176
+ """, unsafe_allow_html=True)
1177
+
1178
+ st.markdown("<br>", unsafe_allow_html=True)
1179
+
1180
+ # Show a preview if default dataset exists
1181
+ if os.path.exists("dataset.csv"):
1182
+ with st.expander("👀 Preview Default Dataset", expanded=False):
1183
+ try:
1184
+ preview_df = pd.read_csv("dataset.csv", nrows=10)
1185
+ st.markdown(f"**Columns:** {', '.join(preview_df.columns.tolist())}")
1186
+ if "Title" in preview_df.columns:
1187
+ st.dataframe(preview_df[["Title", "Abstract"]].head(10) if "Abstract" in preview_df.columns else preview_df[["Title"]].head(10), use_container_width=True)
1188
+ else:
1189
+ st.dataframe(preview_df.head(10), use_container_width=True)
1190
+ except Exception as e:
1191
+ st.warning(f"Could not preview dataset: {e}")
dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a602dfcb3982c58156c67f4fb2565cc8ec9b4b2368a1b6ad4be3c621c1232218
3
+ size 28342399
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for the AI Topic Modeling Agent
2
+ # 13 packages as specified
3
+
4
+ streamlit>=1.30.0
5
+ pandas>=2.0.0
6
+ numpy>=1.24.0
7
+ scikit-learn>=1.3.0
8
+ nltk>=3.8.0
9
+ bertopic>=0.16.0
10
+ umap-learn>=0.5.4
11
+ hdbscan>=0.8.33
12
+ sentence-transformers>=2.2.0
13
+ groq>=0.4.0
14
+ plotly>=5.18.0
15
+ scipy>=1.11.0
16
+ joblib>=1.3.0
tools.py ADDED
@@ -0,0 +1,626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tools.py — Core functions for the AI-driven topic modeling pipeline.
3
+
4
+ This module provides all analytical functions used by the TopicAgent:
5
+ - CSV ingestion and validation
6
+ - Text preprocessing (lowercasing, stopword removal, cleaning)
7
+ - Topic modeling via BERTopic (with fallback to sklearn LDA)
8
+ - Automatic human-readable label generation
9
+ - Cross-source theme comparison (Title vs Abstract)
10
+ - Taxonomy mapping (MAPPED / NOVEL classification)
11
+ """
12
+
13
+ import re
14
+ import json
15
+ import logging
16
+ from typing import Dict, List, Tuple, Optional, Any
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ import nltk
21
+ from nltk.corpus import stopwords
22
+ from nltk.tokenize import word_tokenize
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Logging
26
+ # ---------------------------------------------------------------------------
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # NLTK data download (idempotent)
32
+ # ---------------------------------------------------------------------------
33
+ for _resource in ("punkt", "punkt_tab", "stopwords"):
34
+ try:
35
+ nltk.data.find(f"tokenizers/{_resource}" if "punkt" in _resource else f"corpora/{_resource}")
36
+ except LookupError:
37
+ nltk.download(_resource, quiet=True)
38
+
39
# ---------------------------------------------------------------------------
# Reference taxonomy of known AI / business / research themes
# Used by create_taxonomy_map() for MAPPED vs NOVEL classification
# ---------------------------------------------------------------------------
# Category sub-lists kept private; KNOWN_THEMES is the flat public list.
_AI_ML_THEMES = [
    "artificial intelligence", "machine learning", "deep learning", "neural network",
    "natural language processing", "computer vision", "reinforcement learning",
    "generative ai", "large language model", "transformer", "chatbot",
    "recommendation system", "knowledge graph", "robotics", "autonomous",
    "explainable ai", "federated learning", "transfer learning", "ai ethics",
    "adversarial", "gan", "diffusion model", "prompt engineering",
]
_DATA_SCIENCE_THEMES = [
    "data mining", "big data", "analytics", "data science", "data quality",
    "feature engineering", "dimensionality reduction", "clustering", "classification",
    "regression", "time series", "anomaly detection", "sentiment analysis",
]
_BUSINESS_THEMES = [
    "digital transformation", "innovation", "strategy", "supply chain",
    "customer experience", "marketing", "e-commerce", "fintech", "blockchain",
    "sustainability", "corporate social responsibility", "knowledge management",
    "decision support", "business intelligence", "enterprise", "organizational",
    "human resource", "leadership", "entrepreneurship", "business model",
]
_INFO_SYSTEMS_THEMES = [
    "information systems", "technology adoption", "user acceptance", "privacy",
    "security", "trust", "social media", "online community", "platform",
    "crowdsourcing", "cloud computing", "iot", "internet of things",
    "software engineering", "agile", "devops", "digital platform",
]
_HEALTH_SOCIETY_THEMES = [
    "healthcare", "telemedicine", "electronic health", "public health",
    "education", "e-learning", "smart city", "government", "policy",
    "ethics", "fairness", "bias", "misinformation", "content moderation",
]
_RESEARCH_METHOD_THEMES = [
    "survey", "experiment", "case study", "meta-analysis", "bibliometric",
    "systematic review", "structural equation", "grounded theory",
]

# Flat list preserving the original category ordering.
KNOWN_THEMES: List[str] = (
    _AI_ML_THEMES
    + _DATA_SCIENCE_THEMES
    + _BUSINESS_THEMES
    + _INFO_SYSTEMS_THEMES
    + _HEALTH_SOCIETY_THEMES
    + _RESEARCH_METHOD_THEMES
)
74
+
75
+
76
# ===================================================================
# 1. load_csv — Ingest and validate the CSV dataset
# ===================================================================
def load_csv(filepath: str) -> pd.DataFrame:
    """
    Load a CSV file and ensure the required columns (Title, Abstract) exist.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.

    Returns
    -------
    pd.DataFrame
        DataFrame with at least 'Title' and 'Abstract' columns, with no
        row where both fields are blank.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist (raised by pandas).
    ValueError
        If required columns are missing.
    """
    logger.info("Loading CSV from %s", filepath)
    df = pd.read_csv(filepath, encoding="utf-8", on_bad_lines="skip")
    logger.info("Loaded %d rows × %d columns", len(df), len(df.columns))

    # Validate required columns (case-insensitive match)
    col_map = {c.strip().lower(): c for c in df.columns}
    missing = {"title", "abstract"} - set(col_map.keys())
    if missing:
        raise ValueError(f"CSV is missing required columns: {missing}. Found: {list(df.columns)}")

    # Rename to canonical form
    df = df.rename(columns={col_map["title"]: "Title", col_map["abstract"]: "Abstract"})

    # Normalise missing values to empty strings first, then drop rows where
    # BOTH fields are blank. The previous dropna(how="all") only removed
    # NaN pairs and let rows containing empty/whitespace-only strings through,
    # contradicting the documented contract.
    df["Title"] = df["Title"].fillna("")
    df["Abstract"] = df["Abstract"].fillna("")
    blank = (
        df["Title"].astype(str).str.strip().eq("")
        & df["Abstract"].astype(str).str.strip().eq("")
    )
    df = df[~blank].reset_index(drop=True)

    logger.info("After cleaning: %d usable rows", len(df))
    return df
121
+
122
+
123
+ # ===================================================================
124
+ # 2. preprocess_text — Clean and normalise a list of text documents
125
+ # ===================================================================
126
def preprocess_text(documents: List[str]) -> List[str]:
    """
    Clean and normalise raw text documents for topic modeling.

    Pipeline per document: lowercase -> strip URLs and emails -> keep
    letters/spaces only -> collapse whitespace -> tokenize -> drop
    stopwords and tokens of length <= 2 -> rejoin with single spaces.
    Non-string or blank inputs map to the empty string.

    Parameters
    ----------
    documents : list of str
        Raw text documents.

    Returns
    -------
    list of str
        Cleaned text documents, one per input document.
    """
    stop_words = set(stopwords.words("english"))
    # Boilerplate terms common in academic abstracts, added to NLTK's list.
    stop_words |= {
        "©", "elsevier", "rights", "reserved", "doi", "http", "https",
        "vol", "pp", "fig", "table", "journal", "author", "authors",
        "study", "paper", "research", "results", "findings", "however",
        "propose", "proposed", "approach", "using", "based", "also",
        "show", "shows", "shown", "may", "used", "use", "one", "two",
        "three", "new", "well", "within", "among", "across", "toward",
        "towards", "et", "al", "ie", "eg", "cf", "thus", "therefore",
        "moreover", "furthermore", "addition", "conclusion", "conclusions",
    }

    def _clean(raw: str) -> str:
        # Normalisation steps applied in a fixed order.
        text = raw.lower()
        text = re.sub(r"https?://\S+|www\.\S+", " ", text)  # URLs
        text = re.sub(r"\S+@\S+", " ", text)                # emails
        text = re.sub(r"[^a-z\s]", " ", text)               # digits / punctuation
        text = re.sub(r"\s+", " ", text).strip()            # collapse whitespace
        kept = [tok for tok in word_tokenize(text)
                if tok not in stop_words and len(tok) > 2]
        return " ".join(kept)

    cleaned: List[str] = [
        _clean(doc) if isinstance(doc, str) and doc.strip() else ""
        for doc in documents
    ]

    logger.info("Preprocessed %d documents", len(cleaned))
    return cleaned
183
+
184
+
185
+ # ===================================================================
186
+ # 3. run_topic_modeling — Discover topics via BERTopic (or LDA fallback)
187
+ # ===================================================================
188
def run_topic_modeling(
    documents: List[str],
    source_label: str = "documents",
    min_topics: int = 100,
    use_bertopic: bool = True,
) -> Tuple[pd.DataFrame, Any]:
    """
    Discover topics in a corpus of preprocessed documents.

    Strategy: attempt BERTopic (UMAP + HDBSCAN) first when requested.
    If BERTopic fails, or yields fewer than ``min_topics`` topics, fall
    back to sklearn LDA configured with exactly ``min_topics`` components
    so the requested topic count is guaranteed.

    Parameters
    ----------
    documents : list of str
        Preprocessed text documents; empty strings are ignored.
    source_label : str
        Label used in log messages (e.g. "Titles" or "Abstracts").
    min_topics : int
        Minimum number of topics required (default 100).
    use_bertopic : bool
        Whether to attempt BERTopic before falling back to LDA.

    Returns
    -------
    topics_df : pd.DataFrame
        Columns: topic_id, keywords (comma-separated), source.
    model : object
        The fitted topic model for downstream inspection.

    Raises
    ------
    ValueError
        If fewer than 20 non-empty documents are supplied.
    """
    corpus = [doc for doc in documents if doc.strip()]
    if len(corpus) < 20:
        raise ValueError(f"Not enough valid documents ({len(corpus)}) for topic modeling.")

    logger.info("Running topic modeling on %d %s (target ≥ %d topics)", len(corpus), source_label, min_topics)

    topics_df: Optional[pd.DataFrame] = None
    model: Any = None

    # First pass: BERTopic (optional, may fail on small/odd corpora).
    if use_bertopic:
        try:
            topics_df, model = _run_bertopic(corpus, source_label, min_topics)
        except Exception as exc:
            logger.warning("BERTopic failed (%s). Falling back to LDA.", exc)
            topics_df = None

    # Second pass: LDA guarantees the requested topic count.
    if topics_df is None or len(topics_df) < min_topics:
        logger.info("Using LDA to guarantee ≥ %d topics for %s", min_topics, source_label)
        topics_df, model = _run_lda(corpus, source_label, min_topics)

    logger.info("Topic modeling complete for %s: %d topics discovered", source_label, len(topics_df))
    return topics_df, model
246
+
247
+
248
def _run_bertopic(docs: List[str], source_label: str, min_topics: int):
    """Fit a BERTopic model (UMAP + HDBSCAN + CountVectorizer) and
    summarise its topics as a DataFrame of topic_id / keywords / source.

    The outlier topic (-1) is excluded from the summary. ``min_topics``
    is accepted for signature symmetry with _run_lda; the caller checks
    the resulting topic count itself.
    """
    from bertopic import BERTopic
    from umap import UMAP
    from hdbscan import HDBSCAN
    from sklearn.feature_extraction.text import CountVectorizer

    topic_model = BERTopic(
        umap_model=UMAP(
            n_neighbors=10,
            n_components=5,
            min_dist=0.0,
            metric="cosine",
            random_state=42,
        ),
        hdbscan_model=HDBSCAN(
            min_cluster_size=5,
            min_samples=2,
            prediction_data=True,
        ),
        vectorizer_model=CountVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_df=0.90,
            min_df=2,
        ),
        nr_topics="auto",
        top_n_words=10,
        verbose=False,
    )

    topic_model.fit_transform(docs)

    # Summarise every non-outlier topic by its top-10 keywords.
    info = topic_model.get_topic_info()
    info = info[info["Topic"] != -1].reset_index(drop=True)

    records = []
    for _, entry in info.iterrows():
        topic_id = int(entry["Topic"])
        top_words = topic_model.get_topic(topic_id)
        records.append({
            "topic_id": topic_id,
            "keywords": ", ".join(word for word, _ in top_words[:10]),
            "source": source_label,
        })

    return pd.DataFrame(records), topic_model
297
+
298
+
299
def _run_lda(docs: List[str], source_label: str, n_topics: int):
    """Fit sklearn LDA with exactly ``n_topics`` components and return a
    DataFrame of topic_id / top-10 keywords / source plus the model."""
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer

    # Uni- and bi-gram bag-of-words representation, capped at 10k terms.
    vec = CountVectorizer(
        stop_words="english",
        max_df=0.90,
        min_df=2,
        ngram_range=(1, 2),
        max_features=10000,
    )
    matrix = vec.fit_transform(docs)
    vocab = vec.get_feature_names_out()

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=25,
        learning_method="online",
        random_state=42,
        n_jobs=-1,
    )
    lda.fit(matrix)

    # For each component, rank terms by weight and keep the top 10.
    records = []
    for topic_idx, weights in enumerate(lda.components_):
        ranked = weights.argsort()[-10:][::-1]
        records.append({
            "topic_id": topic_idx,
            "keywords": ", ".join(vocab[i] for i in ranked),
            "source": source_label,
        })

    return pd.DataFrame(records), lda
331
+
332
+
333
+ # ===================================================================
334
+ # 4. generate_labels — Create human-readable labels for each topic
335
+ # ===================================================================
336
def generate_labels(
    topics_df: pd.DataFrame,
    use_llm: bool = False,
    groq_api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Attach a short human-readable 'label' column to every topic.

    Dispatch: when ``use_llm`` is True AND an API key is supplied, labels
    come from the Groq LLM (llama-3.3-70b-versatile); otherwise a
    keyword-based heuristic (title-cased top keywords) is used.

    Parameters
    ----------
    topics_df : pd.DataFrame
        Must contain columns 'topic_id' and 'keywords'.
    use_llm : bool
        Whether to use the Groq LLM for label generation.
    groq_api_key : str, optional
        Groq API key; required for the LLM path.

    Returns
    -------
    pd.DataFrame
        Copy of the input with an additional 'label' column.
    """
    if use_llm and groq_api_key:
        logger.info("Generating labels using Groq LLM …")
        return _generate_labels_llm(topics_df, groq_api_key)

    logger.info("Generating labels using keyword heuristic …")
    return _generate_labels_heuristic(topics_df)
371
+
372
+
373
+ def _generate_labels_heuristic(df: pd.DataFrame) -> pd.DataFrame:
374
+ """Create labels from the top keywords of each topic."""
375
+ labels = []
376
+ for _, row in df.iterrows():
377
+ kws = [kw.strip() for kw in row["keywords"].split(",")]
378
+ # Take the first 3-4 non-trivial keywords and title-case them
379
+ candidates = [kw.title() for kw in kws if len(kw) > 2][:4]
380
+ label = " / ".join(candidates) if candidates else f"Topic {row['topic_id']}"
381
+ labels.append(label)
382
+ df = df.copy()
383
+ df["label"] = labels
384
+ return df
385
+
386
+
387
def _generate_labels_llm(df: pd.DataFrame, api_key: str) -> pd.DataFrame:
    """
    Generate contextual topic labels via the Groq LLM API (batched).

    Topics are sent in batches of 10; the model is asked to return a JSON
    list of {"topic_id", "label"} objects. On any failure (missing groq
    package, API error, unparsable response) the affected topics fall
    back to a keyword-derived label, so API issues never raise.

    Fix: topic ids are normalised to ``str`` on both sides of the lookup.
    The LLM may return "topic_id" as a JSON string (e.g. "5") while the
    DataFrame holds ints; the previous exact-key lookup then silently
    missed every entry and produced generic "Topic N" labels.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'topic_id' and 'keywords' columns.
    api_key : str
        Groq API key.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with an added 'label' column.
    """
    import time
    try:
        from groq import Groq
    except ImportError:
        logger.warning("groq package not installed. Falling back to heuristic labels.")
        return _generate_labels_heuristic(df)

    client = Groq(api_key=api_key)
    labels: List[str] = []

    def _fallback_label(row) -> str:
        # Keyword-derived label used when the LLM response is unusable.
        kws = [kw.strip().title() for kw in row["keywords"].split(",")][:4]
        return " / ".join(kws)

    # Process in small batches to stay within free-tier rate limits.
    batch_size = 10
    for batch_start in range(0, len(df), batch_size):
        batch = df.iloc[batch_start:batch_start + batch_size]
        prompt_lines = [
            f"Topic {row['topic_id']}: keywords = [{row['keywords']}]"
            for _, row in batch.iterrows()
        ]
        prompt = (
            "You are a research taxonomy expert. For each topic below, "
            "generate a concise, descriptive label (3-6 words) that captures "
            "the theme of the keywords. Return ONLY a JSON list of objects "
            'with keys "topic_id" and "label". No extra text.\n\n'
            + "\n".join(prompt_lines)
        )

        try:
            chat = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=1024,
            )
            resp = chat.choices[0].message.content.strip()
            # Extract the first JSON array from the (possibly chatty) reply.
            json_match = re.search(r"\[.*\]", resp, re.DOTALL)
            if json_match:
                batch_labels = json.loads(json_match.group())
                # Normalise ids to str so "5" (LLM JSON) matches 5 (DataFrame).
                label_map = {str(item["topic_id"]): item["label"] for item in batch_labels}
                for _, row in batch.iterrows():
                    labels.append(label_map.get(str(row["topic_id"]), f"Topic {row['topic_id']}"))
            else:
                # No parseable JSON in this reply: heuristic labels for the batch.
                for _, row in batch.iterrows():
                    labels.append(_fallback_label(row))
        except Exception as exc:
            logger.warning("Groq API error for batch starting at %d: %s", batch_start, exc)
            for _, row in batch.iterrows():
                labels.append(_fallback_label(row))

        # Rate-limit courtesy delay between batches.
        time.sleep(0.5)

    out = df.copy()
    out["label"] = labels
    return out
448
+
449
+
450
+ # ===================================================================
451
+ # 5. compare_themes — Cross-compare title vs abstract topics
452
+ # ===================================================================
453
def compare_themes(
    title_topics: pd.DataFrame,
    abstract_topics: pd.DataFrame,
) -> pd.DataFrame:
    """
    Cross-compare title-derived and abstract-derived topics.

    For every title topic, the best-matching abstract topic is located
    by Jaccard similarity over the comma-separated keyword sets, and the
    pair is reported together with a qualitative alignment rating
    (Strong ≥ 0.4, Moderate ≥ 0.2, Weak > 0, otherwise No Match).

    Parameters
    ----------
    title_topics : pd.DataFrame
        Topics from titles ('topic_id', 'keywords', optional 'label').
    abstract_topics : pd.DataFrame
        Topics from abstracts (same columns).

    Returns
    -------
    pd.DataFrame
        One row per title topic with columns: title_topic_id,
        title_label, title_keywords, abstract_topic_id, abstract_label,
        abstract_keywords, similarity, alignment.
    """
    logger.info("Comparing themes: %d title topics × %d abstract topics",
                len(title_topics), len(abstract_topics))

    def _tokens(raw: str) -> set:
        # Normalised keyword set: stripped, lower-cased, empties removed.
        return {piece.strip().lower() for piece in raw.split(",") if piece.strip()}

    def _rate(score: float) -> str:
        if score >= 0.4:
            return "Strong"
        if score >= 0.2:
            return "Moderate"
        if score > 0:
            return "Weak"
        return "No Match"

    records = []
    for _, title_row in title_topics.iterrows():
        title_kws = _tokens(title_row["keywords"])
        top_score = 0.0
        top_row = None

        # Scan every abstract topic for the highest Jaccard similarity.
        for _, abs_row in abstract_topics.iterrows():
            abs_kws = _tokens(abs_row["keywords"])
            if not title_kws or not abs_kws:
                continue
            union = len(title_kws | abs_kws)
            score = len(title_kws & abs_kws) / union if union else 0.0
            if score > top_score:
                top_score = score
                top_row = abs_row

        matched = top_row is not None
        records.append({
            "title_topic_id": title_row["topic_id"],
            "title_label": title_row.get("label", ""),
            "title_keywords": title_row["keywords"],
            "abstract_topic_id": top_row["topic_id"] if matched else None,
            "abstract_label": top_row.get("label", "") if matched else "",
            "abstract_keywords": top_row["keywords"] if matched else "",
            "similarity": round(top_score, 4),
            "alignment": _rate(top_score),
        })

    result = pd.DataFrame(records)
    logger.info("Theme comparison complete: %d rows", len(result))
    return result
527
+
528
+
529
+ # ===================================================================
530
+ # 6. create_taxonomy_map — Classify themes as MAPPED or NOVEL
531
+ # ===================================================================
532
def create_taxonomy_map(
    topics_df: pd.DataFrame,
    known_themes: Optional[List[str]] = None,
    threshold: float = 0.15,
) -> Dict[str, Any]:
    """
    Split topics into MAPPED (resembling a known research theme) and
    NOVEL (no sufficiently similar known theme).

    Each topic's keywords — both the whole comma-separated phrases and
    their individual words — are compared against every known theme with
    a Jaccard-style overlap ratio; the best score decides the class.

    Parameters
    ----------
    topics_df : pd.DataFrame
        Must contain 'topic_id', 'keywords', and 'label' columns.
    known_themes : list of str, optional
        Reference themes (defaults to the built-in KNOWN_THEMES).
    threshold : float
        Minimum best-overlap ratio required to classify as MAPPED.

    Returns
    -------
    dict
        JSON-serialisable taxonomy map:
        {
            "metadata": {total_topics, mapped_count, novel_count, threshold},
            "mapped": [ {topic_id, label, keywords, matched_theme, score, classification}, ... ],
            "novel": [ {topic_id, label, keywords, score, classification}, ... ],
        }
    """
    themes = KNOWN_THEMES if known_themes is None else known_themes

    logger.info("Building taxonomy map for %d topics (threshold=%.2f)", len(topics_df), threshold)

    mapped: List[Dict] = []
    novel: List[Dict] = []

    # Pre-tokenise the reference themes once.
    theme_token_sets = [set(name.lower().split()) for name in themes]

    for _, row in topics_df.iterrows():
        phrases = {
            piece.strip().lower()
            for piece in row["keywords"].split(",")
            if piece.strip()
        }
        # Match on whole phrases AND their constituent words.
        tokens = set(phrases)
        for phrase in phrases:
            tokens.update(phrase.split())

        best_score = 0.0
        best_theme = ""
        for theme_name, theme_tokens in zip(themes, theme_token_sets):
            if not tokens or not theme_tokens:
                continue
            union_size = len(tokens | theme_tokens)
            score = len(tokens & theme_tokens) / union_size if union_size else 0.0
            if score > best_score:
                best_score = score
                best_theme = theme_name

        record = {
            "topic_id": int(row["topic_id"]),
            "label": row.get("label", ""),
            "keywords": row["keywords"],
            "score": round(best_score, 4),
        }

        if best_score >= threshold:
            record["matched_theme"] = best_theme
            record["classification"] = "MAPPED"
            mapped.append(record)
        else:
            record["classification"] = "NOVEL"
            novel.append(record)

    taxonomy = {
        "metadata": {
            "total_topics": len(topics_df),
            "mapped_count": len(mapped),
            "novel_count": len(novel),
            "threshold": threshold,
        },
        "mapped": mapped,
        "novel": novel,
    }

    logger.info("Taxonomy: %d MAPPED, %d NOVEL", len(mapped), len(novel))
    return taxonomy