"""tools.py — Sentence-level BERTopic pipeline + Mistral LLM. Version 3.0.0 | 4 April 2026. ZERO for/while/if.

PIPELINE:
Paper → split into sentences → each sentence gets paper_id + sent_id + metadata
→ embed sentences (384d) → AgglomerativeClustering cosine → centroid nearest 5 sentences
→ Mistral labels topics from sentence evidence + paper metadata
→ one paper can span MULTIPLE topics
"""
| from langchain_core.tools import tool |
| import os |
| import json |
| import re |
| import numpy as np |
| import pandas as pd |
|
|
| |
| |
| |
# Verbose tracing toggle. `debug` resolves to `print` when DEBUG is True,
# otherwise to a no-op lambda — dict dispatch keeps the file's zero-if rule.
DEBUG = True
debug = {True: print, False: lambda *a, **k: None}[DEBUG]


# Where intermediate artifacts (embeddings, summaries, labels, themes) land.
CHECKPOINT_DIR = "/tmp/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


# Number of centroid-nearest sentences kept as evidence per topic.
NEAREST_K = 5
# Sentence boundary: split after . ! or ? when followed by an uppercase letter.
SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'
# Fragments shorter than this many characters are dropped as noise.
MIN_SENT_LEN = 30


# run_key -> DataFrame columns that get concatenated into the clustered text.
RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}


# Module-level store shared across tool invocations (df, models, embeddings).
_data = {}
|
|
|
|
| |
| |
| |
def _split_sentences(text):
    """Break *text* into sentences, discarding fragments under MIN_SENT_LEN chars.

    A boundary is any whitespace run that follows . ! or ? and precedes an
    uppercase letter (SENT_SPLIT_RE)."""
    fragments = re.split(SENT_SPLIT_RE, str(text))
    long_enough = lambda fragment: len(fragment.strip()) >= MIN_SENT_LEN
    return list(filter(long_enough, fragments))
|
|
|
|
| |
| |
| |
@tool
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export and show preview. Call this first.

    Args:
        filepath: Path to the uploaded .csv file.

    Returns:
        Row count, column names, and sample data."""
    debug(f"\n>>> TOOL: load_scopus_csv(filepath='{filepath}')")
    df = pd.read_csv(filepath, encoding="utf-8-sig")
    _data["df"] = df
    debug(f">>> Loaded {len(df)} rows, {len(df.columns)} columns")
    # Only preview columns that actually exist in this export.
    target_cols = list(filter(lambda c: c in df.columns, ["Title", "Abstract", "Author Keywords"]))
    sample = df[target_cols].head(3).to_string(max_colwidth=80)
    null_counts = ", ".join(list(map(
        lambda c: f"{c}: {df[c].notna().sum()}/{len(df)}", target_cols)))

    # FIX: use DataFrame.get so a missing Abstract/Title column yields an
    # empty Series instead of raising KeyError (target_cols above already
    # anticipates that exports may lack these columns).
    abstracts = df.get("Abstract", pd.Series(dtype=object))
    sample_sents = abstracts.head(5).fillna("").apply(_split_sentences).apply(len)
    # mean() of an empty sample is NaN; nan_to_num keeps int() from raising.
    avg_abstract_sents = float(np.nan_to_num(sample_sents.mean()))
    est_abstract = int(avg_abstract_sents * len(df))
    title_count = int(df.get("Title", pd.Series(dtype=object)).notna().sum())

    return (f"π **Dataset Statistics:**\n"
            f"- **Papers:** {len(df)}\n"
            f"- **Abstract sentences:** ~{est_abstract} (~{avg_abstract_sents:.0f} per paper)\n"
            f"- **Title sentences:** {title_count} (1 per paper)\n"
            f"- **Non-null:** {null_counts}\n\n"
            f"Columns: {', '.join(list(df.columns)[:15])}\n\n"
            f"Sample:\n{sample}")
|
|
|
|
| |
| |
| |
@tool
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Sentence-level BERTopic: split papers β embed sentences β cosine similarity clustering β centroid nearest 5 β Plotly charts.
    Each sentence keeps paper_id, sent_id, and metadata. One paper can span multiple topics.
    Uses AgglomerativeClustering with cosine distance β groups sentences by similarity threshold.

    Args:
        run_key: One of 'abstract' or 'title' β selects which columns to split into sentences.
        threshold: Cosine distance threshold (0.0-1.0). Lower = stricter = more topics.
            0.5 = very strict (~2000 topics), 0.7 = recommended (~100 topics, default), 0.8 = loose (~30 topics), 0.9 = very loose (~10 topics).

    Returns:
        Topic summary with sentence counts, paper counts, and 5 nearest centroid sentences."""
    debug(f"\n>>> TOOL: run_bertopic_discovery(run_key='{run_key}', threshold={threshold})")
    # Heavy imports stay function-local so module import remains cheap.
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer

    df = _data["df"].copy()
    cols = RUN_CONFIGS[run_key]
    available = list(filter(lambda c: c in df.columns, cols))
    debug(f">>> Columns: {available}")

    # Assemble one text blob per paper from the configured columns.
    df["_text"] = df[available].fillna("").agg(" ".join, axis=1)
    df["_paper_id"] = df.index
    debug(f">>> {len(df)} papers assembled")

    debug(">>> Splitting into sentences...")
    df["_sentences"] = df["_text"].apply(_split_sentences)
    debug(f">>> Sentence counts: min={df['_sentences'].apply(len).min()}, "
          f"max={df['_sentences'].apply(len).max()}, "
          f"mean={df['_sentences'].apply(len).mean():.1f}")

    # Explode to one row per sentence; each row keeps its paper's metadata.
    meta_cols = ["_paper_id", "Title", "Author Keywords", "_sentences"]
    available_meta = list(filter(lambda c: c in df.columns, meta_cols))
    sent_df = df[available_meta].explode("_sentences").rename(
        columns={"_sentences": "text"}).reset_index(drop=True)
    sent_df = sent_df.dropna(subset=["text"]).reset_index(drop=True)
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()

    # Drop publisher boilerplate / stock closing sentences before clustering.
    debug(">>> Filtering publisher boilerplate...")
    _n_before = len(sent_df)
    boilerplate_patterns = "|".join([
        r"Licensee MDPI",
        r"Published by Informa",
        r"Published by Elsevier",
        r"Taylor & Francis",
        r"Copyright Β©",
        r"Creative Commons",
        r"open access article",
        r"Inderscience Enterprises",
        r"All rights reserved",
        r"This is an open access",
        r"distributed under the terms",
        r"The Author\(s\)",
        r"Springer Nature",
        r"Emerald Publishing",
        r"limitations and future",
        r"limitations and implications",
        r"limitations are discussed",
        r"limitations have been discussed",
        r"implications are discussed",
        r"implications were discussed",
        r"implications are presented",
        r"concludes with .* implications",
    ])
    clean_mask = ~sent_df["text"].str.contains(boilerplate_patterns, case=False, regex=True, na=False)
    sent_df = sent_df[clean_mask].reset_index(drop=True)
    # Renumber sent_id so it stays contiguous after filtering.
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()
    debug(f">>> Filtered: {_n_before} β {len(sent_df)} sentences ({_n_before - len(sent_df)} boilerplate removed)")
    n_sentences = len(sent_df)
    n_papers = len(df)
    debug(f">>> {n_sentences} sentences from {n_papers} papers")

    # Embed; normalize_embeddings=True makes dot product == cosine similarity.
    debug(">>> Embedding sentences with all-MiniLM-L6-v2 (L2-normalized)...")
    docs = sent_df["text"].tolist()
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedder.encode(docs, show_progress_bar=False, normalize_embeddings=True)
    debug(f">>> Embeddings: {embeddings.shape}, normalized: True")

    np.save(f"{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy", embeddings)

    # Cluster raw 384d embeddings: FunctionTransformer is an identity stand-in
    # for UMAP so BERTopic skips dimensionality reduction.
    debug(f">>> AgglomerativeClustering cosine threshold={threshold} on 384d embeddings...")
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.cluster import AgglomerativeClustering
    no_umap = FunctionTransformer()
    cluster_model = AgglomerativeClustering(
        n_clusters=None,
        metric="cosine",
        linkage="average",
        distance_threshold=threshold,
    )
    topic_model = BERTopic(
        hdbscan_model=cluster_model,
        umap_model=no_umap,
    )
    # AgglomerativeClustering yields no probabilities; _probs is unused.
    topics, _probs = topic_model.fit_transform(docs, embeddings)
    n_topics = len(set(topics)) - int(-1 in topics)
    n_outliers = int(np.sum(np.array(topics) == -1))
    debug(f">>> {n_topics} topics, {n_outliers} outlier sentences")

    # Stash run artifacts for consolidate_into_themes / later tools.
    _data[f"{run_key}_model"] = topic_model
    _data[f"{run_key}_topics"] = np.array(topics)
    _data[f"{run_key}_embeddings"] = embeddings
    _data[f"{run_key}_sent_df"] = sent_df

    # Short-circuit `and` gates each chart on its minimum topic count
    # (keeps the file's zero-if convention).
    debug(f">>> Generating visualizations ({n_topics} topics)...")
    (n_topics >= 3) and topic_model.visualize_topics().write_html(
        f"/tmp/rq4_{run_key}_intertopic.html", include_plotlyjs="cdn")
    (n_topics >= 1) and topic_model.visualize_barchart(
        top_n_topics=min(10, max(1, n_topics))).write_html(
        f"/tmp/rq4_{run_key}_bars.html", include_plotlyjs="cdn")
    (n_topics >= 2) and topic_model.visualize_hierarchy().write_html(
        f"/tmp/rq4_{run_key}_hierarchy.html", include_plotlyjs="cdn")
    (n_topics >= 2) and topic_model.visualize_heatmap().write_html(
        f"/tmp/rq4_{run_key}_heatmap.html", include_plotlyjs="cdn")
    debug(f">>> Visualizations saved (skipped charts needing more topics)")

    topics_arr = np.array(topics)
    topic_info = topic_model.get_topic_info()
    valid_rows = list(filter(lambda r: r["Topic"] != -1, topic_info.to_dict("records")))

    def _centroid_nearest(row):
        """Find 5 sentences nearest to topic centroid via cosine similarity."""
        mask = topics_arr == row["Topic"]
        member_idx = np.where(mask)[0]
        member_embs = embeddings[mask]
        centroid = member_embs.mean(axis=0)
        # Epsilon guards division by a zero-norm centroid.
        norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
        cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
        dists = 1 - cosine_sim
        nearest = np.argsort(dists)[:NEAREST_K]

        nearest_evidence = list(map(lambda i: {
            "sentence": str(sent_df.iloc[member_idx[i]]["text"])[:250],
            "paper_id": int(sent_df.iloc[member_idx[i]]["_paper_id"]),
            "title": str(sent_df.iloc[member_idx[i]].get("Title", ""))[:150],
            "keywords": str(sent_df.iloc[member_idx[i]].get("Author Keywords", ""))[:150],
        }, nearest))

        # A paper may contribute several sentences; dedupe for paper-level stats.
        topic_papers_df = sent_df.iloc[member_idx].drop_duplicates(subset=["_paper_id"])
        unique_papers = len(topic_papers_df)
        paper_titles = list(map(
            lambda idx: str(topic_papers_df.iloc[idx].get("Title", ""))[:200],
            range(min(50, unique_papers))))

        return {"topic_id": int(row["Topic"]),
                "sentence_count": int(row["Count"]),
                "paper_count": int(unique_papers),
                "top_words": str(row.get("Name", ""))[:100],
                "nearest": nearest_evidence,
                "paper_titles": paper_titles}

    summaries = list(map(_centroid_nearest, valid_rows))
    # Fix: close the checkpoint file instead of leaking the handle.
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json", "w") as fh:
        json.dump(summaries, fh, indent=2, default=str)
    debug(f">>> {len(summaries)} topics saved ({NEAREST_K} nearest sentences each)")

    lines = list(map(
        lambda s: f" Topic {s['topic_id']} ({s['sentence_count']} sentences, {s['paper_count']} papers): {s['top_words']}",
        summaries))
    return (f"[{run_key}] {n_topics} topics from {n_sentences} sentences ({n_papers} papers, {n_outliers} outliers).\n\n"
            + "\n".join(lines)
            + f"\n\nVisualizations: /tmp/rq4_{run_key}_*.html (4 files)"
            + f"\nCheckpoints: {CHECKPOINT_DIR}/rq4_{run_key}_emb.npy + summaries.json")
|
|
|
|
| |
| |
| |
@tool
def label_topics_with_llm(run_key: str) -> str:
    """Send 5 nearest centroid sentences + paper metadata to Mistral for labeling.
    Each sentence shows which paper it came from (title + keywords).

    Args:
        run_key: One of 'abstract' or 'title'.

    Returns:
        Labeled topics with sentence-level evidence."""
    debug(f"\n>>> TOOL: label_topics_with_llm(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    # Fix: close the checkpoint file instead of leaking the handle.
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json") as fh:
        summaries = json.load(fh)
    debug(f">>> Loaded {len(summaries)} topics ({NEAREST_K} sentences each)")

    # Cap the prompt: label only the largest clusters to stay within context.
    MAX_LABEL_TOPICS = 100
    sorted_summaries = sorted(summaries, key=lambda s: s.get("sentence_count", 0), reverse=True)
    summaries_to_label = sorted_summaries[:MAX_LABEL_TOPICS]
    skipped = max(0, len(summaries) - MAX_LABEL_TOPICS)
    debug(f">>> Labeling top {len(summaries_to_label)} topics (skipped {skipped} small clusters)")

    topics_block = "\n\n".join(list(map(
        lambda s: (f"Topic {s['topic_id']} ({s['sentence_count']} sentences from {s['paper_count']} papers):\n"
                   f" Top words: {s['top_words']}\n"
                   f" {NEAREST_K} nearest centroid sentences:\n"
                   + "\n".join(list(map(
                       lambda e: (f" - \"{e['sentence'][:200]}\"\n"
                                  f" Paper: \"{e['title']}\"\n"
                                  f" Keywords: {e['keywords']}"),
                       s["nearest"])))),
        summaries_to_label)))

    prompt = PromptTemplate.from_template(
        "You are a research topic classifier for academic papers about Technology and Tourism.\n\n"
        "For EACH topic below, you are given the 5 sentences nearest to the topic centroid,\n"
        "plus the paper title and author keywords each sentence came from.\n\n"
        "Return a JSON ARRAY with one object per topic:\n"
        "- topic_id: integer\n"
        "- label: short descriptive name (3-6 words, specific β NOT generic like 'tourism studies')\n"
        "- category: general research area (e.g., 'technology adoption', 'consumer behavior',\n"
        "  'virtual reality', 'social media marketing', 'sustainability', 'cultural heritage',\n"
        "  'AI and machine learning', 'online reviews', 'destination marketing',\n"
        "  'tourist psychology', 'hotel management', 'sharing economy',\n"
        "  'mobile applications', 'research methodology', 'data analytics')\n"
        "  DO NOT use PACIS/ICIS categories β just plain descriptive research area.\n"
        "- confidence: high, medium, or low\n"
        "- reasoning: 1 sentence explaining WHY you chose this label based on the evidence sentences\n"
        "- niche: true or false (true = very specific sub-area with <20 sentences)\n\n"
        "CRITICAL: be SPECIFIC in labels. Do NOT use broad terms.\n"
        "Return ONLY valid JSON array, no markdown.\n\n"
        "Topics:\n{topics}")

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
    chain = prompt | llm | JsonOutputParser()
    debug(">>> Calling Mistral (single call, all topics)...")
    labels = chain.invoke({"topics": topics_block})
    debug(f">>> Got {len(labels)} labels")

    # BUG FIX: labels follow sorted_summaries order, not the original summaries
    # order, so positional zip misassigned labels (and padding with summaries
    # merged unrelated topics' data). Merge by topic_id instead; topics the
    # LLM skipped keep their summary fields unlabeled.
    label_by_id = dict(map(lambda l: (l.get("topic_id"), l), labels))
    labeled = list(map(lambda s: {**s, **label_by_id.get(s.get("topic_id"), {})}, summaries))
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json", "w") as fh:
        json.dump(labeled, fh, indent=2, default=str)
    debug(f">>> Labels saved: {CHECKPOINT_DIR}/rq4_{run_key}_labels.json")

    lines = list(map(
        lambda l: (f" **Topic {l.get('topic_id', '?')}: {l.get('label', '?')}** "
                   f"[{l.get('category', '?')}] conf={l.get('confidence', '?')} "
                   f"({l.get('sentence_count', 0)} sentences, {l.get('paper_count', 0)} papers)\n"
                   + "\n".join(list(map(
                       lambda e: f" β \"{e['sentence'][:120]}...\" β _{e['title'][:60]}_",
                       l.get("nearest", []))))),
        labeled))
    return f"[{run_key}] {len(labeled)} topics labeled by Mistral:\n\n" + "\n\n".join(lines)
|
|
|
|
| |
| |
| |
@tool
def generate_comparison_csv() -> str:
    """Compare Mistral-labeled topics across completed runs. Includes sentence + paper counts.

    Returns:
        Comparison table + CSV path."""
    debug(f"\n>>> TOOL: generate_comparison_csv()")
    from itertools import chain
    # A run is "completed" once its labels checkpoint exists on disk.
    completed = list(filter(
        lambda k: os.path.exists(f"{CHECKPOINT_DIR}/rq4_{k}_labels.json"), RUN_CONFIGS.keys()))
    debug(f">>> Completed runs: {completed}")

    def _load_run(run_key):
        """Flatten one run's labels checkpoint into comparison rows."""
        # Fix: close the checkpoint file instead of leaking the handle.
        with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
            labels = json.load(fh)
        return list(map(lambda l: {
            "run": run_key, "topic_id": l.get("topic_id", ""),
            "label": l.get("label", ""), "category": l.get("category", ""),
            "confidence": l.get("confidence", ""), "niche": l.get("niche", ""),
            "sentences": l.get("sentence_count", 0),
            "papers": l.get("paper_count", 0),
            "top_words": l.get("top_words", ""),
        }, labels))

    # chain.from_iterable is linear; sum(lists, []) re-copies at every step.
    all_rows = list(chain.from_iterable(map(_load_run, completed)))
    df = pd.DataFrame(all_rows)
    path = "/tmp/rq4_comparison.csv"
    df.to_csv(path, index=False)
    debug(f">>> Comparison CSV: {path} ({len(df)} rows)")
    return f"Comparison saved: {path} ({len(completed)} runs, {len(df)} topics)\n\n{df.to_string(index=False)}"
|
|
|
|
| |
| |
| |
@tool
def export_narrative(run_key: str) -> str:
    """Generate 500-word narrative for research paper Section 7 via Mistral.

    Args:
        run_key: One of 'abstract' or 'title'.

    Returns:
        500-word narrative + save path."""
    debug(f"\n>>> TOOL: export_narrative(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI

    # Fix: close the checkpoint file instead of leaking the handle.
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
        labels = json.load(fh)
    topics_text = "\n".join(list(map(
        lambda l: f"- {l.get('label', '?')} ({l.get('sentence_count', 0)} sentences from "
                  f"{l.get('paper_count', 0)} papers, category: {l.get('category', '?')}, "
                  f"confidence: {l.get('confidence', '?')}, niche: {l.get('niche', '?')})",
        labels)))

    # Slightly higher temperature for narrative prose than for labeling.
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3, timeout=300)
    result = llm.invoke(
        f"Write exactly 500 words for a research paper Section 7 titled "
        f"'Topic Modeling Results β BERTopic Discovery'.\n\n"
        f"Dataset: 1390 Scopus papers on Tourism and AI.\n"
        f"Method: Sentence-level BERTopic β each abstract split into sentences,\n"
        f"embedded with all-MiniLM-L6-v2 (384d), clustered with AgglomerativeClustering (cosine).\n"
        f"Note: One paper can contribute sentences to MULTIPLE topics.\n"
        f"Run config: '{run_key}' columns.\n\n"
        f"Topics discovered:\n{topics_text}\n\n"
        f"Include: methodology justification for sentence-level approach,\n"
        f"key themes, emerging niches, limitations, future work.")

    path = "/tmp/rq4_narrative.txt"
    # Fix: `with` guarantees the write is flushed and the handle closed
    # before the path is reported back to the agent.
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(result.content)
    debug(f">>> Narrative saved: {path} ({len(result.content)} chars)")
    return f"Narrative saved: {path}\n\n{result.content}"
|
|
|
|
| |
| |
| |
@tool
def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
    """ROUND 2: Merge fine-grained Round 1 topics into broader themes.
    Researcher decides which topics to group. Recomputes centroids and evidence.

    Args:
        run_key: 'abstract' or 'title'.
        theme_map: Dict mapping theme names to topic ID lists.
            Example: {"AI in Tourism": [0, 1, 5], "VR Tourism": [2, 3]}

    Returns:
        Consolidated themes with new 5-nearest sentence evidence per theme."""
    debug(f"\n>>> TOOL: consolidate_into_themes(run_key='{run_key}', {len(theme_map)} themes)")

    # In-memory artifacts stored by run_bertopic_discovery for this run_key.
    topics_arr = _data[f"{run_key}_topics"]
    embeddings = _data[f"{run_key}_embeddings"]
    sent_df = _data[f"{run_key}_sent_df"]

    def _build_theme(item):
        """Merge listed topics into one theme. Recompute centroid + 5 nearest."""
        theme_name, topic_ids = item
        mask = np.isin(topics_arr, topic_ids)
        member_idx = np.where(mask)[0]
        member_embs = embeddings[mask]
        centroid = member_embs.mean(axis=0)
        # Cosine distance to the new merged centroid; epsilon guards zero norm.
        norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
        cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
        dists = 1 - cosine_sim
        nearest = np.argsort(dists)[:NEAREST_K]

        nearest_evidence = list(map(lambda i: {
            "sentence": str(sent_df.iloc[member_idx[i]]["text"])[:250],
            "paper_id": int(sent_df.iloc[member_idx[i]]["_paper_id"]),
            "title": str(sent_df.iloc[member_idx[i]].get("Title", ""))[:150],
            "keywords": str(sent_df.iloc[member_idx[i]].get("Author Keywords", ""))[:150],
        }, nearest))

        # One dedupe serves both the paper count and the title sample
        # (original computed nunique() and drop_duplicates() separately).
        topic_papers_df = sent_df.iloc[member_idx].drop_duplicates(subset=["_paper_id"])
        unique_papers = len(topic_papers_df)
        paper_titles = list(map(
            lambda idx: str(topic_papers_df.iloc[idx].get("Title", ""))[:200],
            range(min(50, unique_papers))))

        return {"label": theme_name, "merged_topics": list(topic_ids),
                "sentence_count": int(mask.sum()), "paper_count": int(unique_papers),
                "nearest": nearest_evidence, "paper_titles": paper_titles}

    # Assign sequential topic_ids so downstream tools treat themes like topics.
    themes_raw = list(map(_build_theme, theme_map.items()))
    themes = list(map(
        lambda pair: {**pair[1], "topic_id": pair[0]},
        enumerate(themes_raw)))
    # Fix: close the checkpoint file instead of leaking the handle.
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json", "w") as fh:
        json.dump(themes, fh, indent=2, default=str)
    debug(f">>> {len(themes)} themes saved: {CHECKPOINT_DIR}/rq4_{run_key}_themes.json")

    lines = list(map(
        lambda t: (f" **{t['label']}** ({t['sentence_count']} sentences, {t['paper_count']} papers)\n"
                   f" Merged from topics: {t['merged_topics']}\n"
                   f" Evidence:\n"
                   + "\n".join(list(map(
                       lambda e: f" β \"{e['sentence'][:120]}...\" β _{e['title'][:60]}_",
                       t["nearest"])))),
        themes))
    return f"[{run_key}] Round 2: {len(themes)} themes consolidated:\n\n" + "\n\n".join(lines)
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
# Established IS research categories from the PAJAIS/PACIS taxonomy
# (Jiang, Liang & Tsai, 2019) β used by compare_with_taxonomy to decide
# which discovered themes map to known areas and which are novel.
PAJAIS_TAXONOMY = [
    "Electronic and Mobile Business / Social Commerce",
    "Human Behavior and IS / Human-Computer Interaction",
    "IS/IT Strategy, Leadership, Governance",
    "Business Intelligence and Data Analytics",
    "Design Science and IS",
    "Enterprise Systems and BPM",
    "IS Implementation, Adoption, and Diffusion",
    "Social Media and Business Impact",
    "Cultural and Global Issues in IS",
    "IS Security and Privacy",
    "IS Smart / IoT",
    "Knowledge Management",
    "ICT / Digital Platform / IT and Work",
    "IS Healthcare",
    "IT Project Management",
    "Service Science and IS",
    "Social and Organizational Aspects of IS",
    "Research Methods and Philosophy",
    "E-Finance / Economics of IS",
    "E-Government",
    "IS Education and Learning",
    "Green IT and Sustainability",
]
|
|
|
|
@tool
def compare_with_taxonomy(run_key: str) -> str:
    """Compare BERTopic themes against established PAJAIS/PACIS taxonomy
    (Jiang, Liang & Tsai, 2019). Identifies which themes map to known
    categories and which are NOVEL/EMERGING (not in existing taxonomy).
    Researcher reviews mapping and approves new theme consolidation.

    Args:
        run_key: 'abstract' or 'title'.

    Returns:
        Mapping table: BERTopic theme β PAJAIS category (or NOVEL)."""
    debug(f"\n>>> TOOL: compare_with_taxonomy(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    # Prefer Round 2 themes when they exist; fall back to Round 1 labels.
    # (`and`/`or` chain keeps the file's zero-if convention.)
    themes_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json"
    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
    source_path = (os.path.exists(themes_path) and themes_path) or labels_path
    # Fix: close the checkpoint file instead of leaking the handle.
    with open(source_path) as fh:
        themes = json.load(fh)
    debug(f">>> Loaded {len(themes)} themes from {source_path}")

    themes_text = "\n".join(list(map(
        lambda t: f"- {t.get('label', '?')} "
                  f"({t.get('paper_count', t.get('count', '?'))} papers)",
        themes)))

    taxonomy_text = "\n".join(list(map(lambda c: f"- {c}", PAJAIS_TAXONOMY)))

    prompt = PromptTemplate.from_template(
        "You are an IS research taxonomy expert.\n\n"
        "Compare each BERTopic theme against the established PAJAIS/PACIS taxonomy.\n"
        "For EACH theme, return a JSON ARRAY with:\n"
        "- label: the BERTopic theme name\n"
        "- pajais_match: closest PAJAIS category (or 'NOVEL' if no match)\n"
        "- match_confidence: high, medium, low, or none\n"
        "- reasoning: why this mapping (1 sentence)\n"
        "- is_novel: true if this theme represents an emerging area not in the taxonomy\n\n"
        "Return ONLY valid JSON array.\n\n"
        "BERTopic Themes:\n{themes}\n\n"
        "PAJAIS Taxonomy (Jiang et al., 2019):\n{taxonomy}")

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
    chain = prompt | llm | JsonOutputParser()
    debug(">>> Calling Mistral for taxonomy comparison...")
    mappings = chain.invoke({"themes": themes_text, "taxonomy": taxonomy_text})
    debug(f">>> Got {len(mappings)} mappings")

    # Fix: close the checkpoint file instead of leaking the handle.
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json", "w") as fh:
        json.dump(mappings, fh, indent=2, default=str)

    novel = list(filter(lambda m: m.get("is_novel", False), mappings))
    mapped = list(filter(lambda m: not m.get("is_novel", False), mappings))

    # SYNTAX FIX: the original f-string was split by a raw newline
    # (mojibake-corrupted marker), which is invalid Python β rejoined here.
    mapped_lines = list(map(
        lambda m: f" β {m.get('label', '?')} β **{m.get('pajais_match', '?')}** "
                  f"(conf={m.get('match_confidence', '?')}) _{m.get('reasoning', '')}_",
        mapped))
    novel_lines = list(map(
        lambda m: f" π **{m.get('label', '?')}** β NOVEL "
                  f"_{m.get('reasoning', '')}_",
        novel))

    return (f"[{run_key}] Taxonomy comparison (Jiang et al., 2019):\n\n"
            f"**Mapped to PAJAIS categories ({len(mapped)}):**\n" + "\n".join(mapped_lines) +
            f"\n\n**NOVEL / Emerging themes ({len(novel)}):**\n" + "\n".join(novel_lines) +
            f"\n\nSaved: {CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json")
|
|
|
|
| |
| |
| |
def get_all_tools():
    """Return all 7 tools with error handling enabled."""
    toolbox = [load_scopus_csv, run_bertopic_discovery, label_topics_with_llm,
               consolidate_into_themes, compare_with_taxonomy,
               generate_comparison_csv, export_narrative]
    # Switch on LangChain's built-in tool-error capture for every tool;
    # tuple(map(...)) forces the side effects without a loop statement.
    tuple(map(lambda t: setattr(t, 'handle_tool_error', True), toolbox))
    debug(f">>> tools.py: {len(toolbox)} tools ready (handle_tool_error=True)")
    tuple(map(lambda t: debug(f">>> - {t.name}"), toolbox))
    return toolbox
|
|