Spaces:

rahull30
/

BERTopic

Sleeping

App Files Files Community

rahull30 committed 18 days ago

Commit

4bf6942

verified ·

1 Parent(s): 7e8b410

Initial Files

Browse files

Files changed (4) hide show

agent.py +95 -0
app.py +41 -0
requirements.txt +6 -0
tools.py +144 -0

agent.py ADDED Viewed

	@@ -0,0 +1,95 @@

+from tools import ResearchTools
+import pandas as pd
+from typing import Dict
class ResearchAgent:
    """Run the topic-modeling pipeline end to end using ``ResearchTools``.

    The agent loads a CSV, preprocesses it, clusters the documents into
    topics, labels the topics, and writes the output files through the
    tools object. A summary of the run is kept in ``self.results``.
    """

    def __init__(self):
        self.tools = ResearchTools()
        self.results = {}

    def plan(self):
        """Record the ordered pipeline steps in ``self.pipeline`` and print them."""
        self.pipeline = [
            "Load and validate data",
            "Preprocess text",
            "Perform topic modeling",
            "Label topics",
            "Compare title vs abstract themes",
            "Extract unique themes",
            "Map themes to taxonomy",
            "Generate outputs"
        ]
        print("📋 Pipeline planned:")
        for i, step in enumerate(self.pipeline, 1):
            print(f"  {i}. {step}")

    @staticmethod
    def _resolve_topic_count(n_documents: int, requested: int) -> int:
        """Return a topic count that is valid for ``n_documents`` documents.

        Clustering cannot produce more clusters than there are documents, so
        the requested count is capped at the corpus size.

        Raises:
            ValueError: if ``requested`` is smaller than 1.
        """
        if requested < 1:
            raise ValueError("n_topics must be at least 1")
        return min(requested, n_documents)

    def execute_pipeline(self, csv_path: str, n_topics: int = 100) -> Dict:
        """Run every pipeline step for the CSV at ``csv_path``.

        Args:
            csv_path: Path of the CSV file to analyse.
            n_topics: Upper bound on the number of topics. It is reduced to
                the number of documents when the corpus is smaller.

        Returns:
            ``self.results`` (with ``num_documents`` and ``num_topics``) on
            success, or ``{"error": <message>}`` if any step raised. Errors
            are reported through the return value, never re-raised.
        """
        print("="*60)
        print("🤖 RESEARCH AGENT - STARTING PIPELINE")
        print("="*60)
        # Start from a clean summary so a reused agent never reports values
        # left over from an earlier run.
        self.results = {}
        try:
            self.plan()
            print()
            # Load
            print("📂 Loading data...")
            df = self.tools.load_csv(csv_path)
            if df is None or df.empty:
                raise ValueError("DataFrame is empty")
            self.results['num_documents'] = len(df)
            # Preprocess
            print("🧹 Preprocessing...")
            df = self.tools.preprocess_corpus(df)
            # Topic modeling
            print("🎯 Topic modeling...")
            topic_count = self._resolve_topic_count(len(df), n_topics)
            topic_model, topic_info = self.tools.perform_topic_modeling(
                df['combined_clean'].tolist(), n_topics=topic_count
            )
            self.results['num_topics'] = len(topic_info)
            # Label
            print("🏷️ Labeling topics...")
            label_df = self.tools.label_topics(topic_model, topic_info)
            # Attach the per-topic document counts to the labels.
            topic_table = pd.merge(
                topic_info[['Topic', 'Count']],
                label_df,
                left_on='Topic',
                right_on='topic_id',
                how='left'
            )
            topic_table = topic_table[['topic_id', 'keywords', 'label', 'Count']]
            topic_table = topic_table.rename(columns={'Count': 'document_count'})
            # Compare
            print("🔄 Comparing...")
            comparison_df = self.tools.compare_title_abstract_themes(df, topic_model)
            # Themes
            print("📊 Extracting themes...")
            all_themes = self.tools.extract_themes(label_df['label'].tolist())
            # Mapping
            print("🗺️ Mapping...")
            taxonomy_map = self.tools.map_to_taxonomy(all_themes)
            # Save outputs
            print("💾 Saving outputs...")
            self.tools.save_outputs(comparison_df, taxonomy_map, topic_table)
            # Additional per-topic keyword export.
            self.tools.generate_keywords_csv(topic_table, taxonomy_map)
            print("✅ DONE")
            return self.results
        except Exception as e:
            # Top-level boundary: log the traceback and hand the message back
            # to the caller instead of crashing the UI.
            import traceback
            traceback.print_exc()
            return {"error": str(e)}

app.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import gradio as gr
+from agent import ResearchAgent
+agent = ResearchAgent()
def run_pipeline(file):
    """Run the research pipeline on an uploaded CSV and report the outputs.

    Args:
        file: The upload handed over by the UI. Either an object exposing
            the temporary file path as ``.name`` or a plain string path;
            ``None`` when nothing was uploaded.

    Returns:
        A 5-tuple ``(status, comparison, taxonomy, topic_table, keywords)``.
        On success the last four items are the output file names; otherwise
        they are ``None`` and ``status`` carries the message to display.
    """
    if file is None:
        return "Upload a CSV file", None, None, None, None
    # Accept both a plain path string and an upload object with ``.name``.
    csv_path = file if isinstance(file, str) else file.name
    try:
        result = agent.execute_pipeline(csv_path)
    except Exception as e:
        # UI boundary: show the failure in the status box rather than raising.
        return str(e), None, None, None, None
    if "error" in result:
        return result["error"], None, None, None, None
    return (
        "✅ Pipeline completed",
        "comparison.csv",
        "taxonomy_map.json",
        "topic_review_table.csv",
        "keywords.csv"
    )
# Single upload widget that feeds run_pipeline.
upload_input = gr.File(label="Upload CSV")
# Output widgets, in the same order as the tuple returned by run_pipeline:
# a status message followed by the four downloadable result files.
output_components = [
    gr.Textbox(label="Status"),
    gr.File(label="Download comparison.csv"),
    gr.File(label="Download taxonomy_map.json"),
    gr.File(label="Download topic_review_table.csv"),
    gr.File(label="Download keywords.csv"),
]
demo = gr.Interface(
    fn=run_pipeline,
    inputs=upload_input,
    outputs=output_components,
    title="Topic Modeling App",
)
# Start the web UI as soon as this module is executed.
demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+pandas==2.0.3
+numpy==1.24.3
+scikit-learn==1.3.0
+nltk==3.8.1
+gradio==3.41.2
+umap-learn==0.5.4

tools.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.cluster import KMeans
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+import re
+import json
+# NLTK setup
+nltk.download('stopwords', quiet=True)
+nltk.download('punkt', quiet=True)
+nltk.download('wordnet', quiet=True)
class ResearchTools:
    """Data loading, text cleaning, clustering and export helpers.

    Output files (``comparison.csv``, ``topic_review_table.csv``,
    ``taxonomy_map.json``, ``keywords.csv``) are written to the current
    working directory.
    """

    # Category assigned to themes that mention "ai" as a standalone word.
    _AI_CATEGORY = "Artificial Intelligence and Machine Learning"
    # Whole-word match, so labels such as "blockchain" or "training" are not
    # mistaken for the token "ai".
    _AI_TOKEN = re.compile(r"\bai\b")
    # Separator between a theme and its category in map_to_taxonomy() output.
    _MAPPING_ARROW = " → "

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.taxonomy = [
            "Artificial Intelligence and Machine Learning",
            "Blockchain and Distributed Ledger",
            "Cloud Computing",
            "Data Analytics and Business Intelligence"
        ]

    def load_csv(self, filepath):
        """Read a CSV and return the rows that have both a title and an abstract.

        Column names are stripped and lower-cased before validation.

        Raises:
            ValueError: if the ``title`` or ``abstract`` column is missing.
        """
        df = pd.read_csv(filepath)
        df.columns = df.columns.str.strip().str.lower()
        if 'title' not in df.columns or 'abstract' not in df.columns:
            raise ValueError("CSV must contain title and abstract")
        df = df.dropna(subset=['title', 'abstract'])
        return df

    def clean_text(self, text):
        """Lower-case, strip non-letters, drop stop words and lemmatize ``text``.

        Non-string values (e.g. a numeric cell) are converted with ``str``
        first so a mistyped column does not crash the pipeline.
        """
        text = str(text).lower()
        text = re.sub(r'[^a-z\s]', ' ', text)
        tokens = word_tokenize(text)
        return ' '.join(
            self.lemmatizer.lemmatize(t) for t in tokens if t not in self.stop_words
        )

    def preprocess_corpus(self, df):
        """Add a ``combined_clean`` column (cleaned title + cleaned abstract).

        The column is added to ``df`` itself, which is also returned.
        """
        df['combined_clean'] = df['title'].apply(self.clean_text) + " " + df['abstract'].apply(self.clean_text)
        return df

    def perform_topic_modeling(self, docs, n_topics=100):
        """Cluster ``docs`` with KMeans over bag-of-words counts.

        Args:
            docs: Iterable of document strings.
            n_topics: Requested number of clusters. It is capped at the
                number of documents because KMeans needs at least one sample
                per cluster.

        Returns:
            ``(model, topic_info)`` where ``model`` offers ``get_topic(i)``
            and ``transform(docs)``, and ``topic_info`` is a DataFrame with
            the columns ``Topic`` and ``Count``.

        Raises:
            ValueError: if ``docs`` is empty or ``n_topics`` is below 1.
        """
        docs = list(docs)
        if not docs:
            raise ValueError("No documents to cluster")
        if n_topics < 1:
            raise ValueError("n_topics must be at least 1")
        n_clusters = min(n_topics, len(docs))
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(docs)
        # n_init is spelled out so results do not shift if the library
        # default changes.
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)
        feature_names = vectorizer.get_feature_names_out()
        # Ten highest-weighted vocabulary terms of each cluster centre.
        topic_keywords = [
            [feature_names[j] for j in center.argsort()[::-1][:10]]
            for center in kmeans.cluster_centers_
        ]
        topic_info = pd.DataFrame({
            'Topic': list(range(n_clusters)),
            'Count': np.bincount(labels, minlength=n_clusters)
        })

        class Model:
            """Minimal topic-model facade over the fitted clustering."""

            def get_topic(self, i):
                # Every keyword gets the same placeholder weight of 1.0.
                return [(w, 1.0) for w in topic_keywords[i]]

            def transform(self, docs):
                # NOTE: ignores ``docs`` and returns the labels of the
                # documents the model was fitted on.
                return labels, None

        return Model(), topic_info

    def label_topics(self, model, topic_info):
        """Build a DataFrame with ``topic_id``, ``label`` and ``keywords``.

        The label joins the first three keywords with " | "; ``keywords``
        lists all of them, comma separated.
        """
        data = []
        for tid in topic_info['Topic']:
            kw = [w for w, _ in model.get_topic(tid)]
            data.append({
                'topic_id': tid,
                'label': ' | '.join(kw[:3]),
                'keywords': ', '.join(kw)
            })
        return pd.DataFrame(data)

    def extract_themes(self, labels):
        """Return the distinct labels, keeping first-seen order.

        Order is preserved so repeated runs produce identical output files.
        """
        return list(dict.fromkeys(labels))

    def compare_title_abstract_themes(self, df, model):
        """Return a one-row placeholder comparison table.

        NOTE: both arguments are currently unused; the row is a fixed sample.
        """
        return pd.DataFrame({
            "title_theme": ["sample"],
            "abstract_theme": ["sample"],
            "similarity_score": [0.5]
        })

    def map_to_taxonomy(self, themes):
        """Split ``themes`` into taxonomy-mapped and novel ones.

        A theme is mapped when it contains "ai" as a whole word (case
        insensitive); everything else is reported as novel.

        Returns:
            ``{"mapped": ["<theme> → <category>", ...], "novel": [...]}``
        """
        mapped = []
        novel = []
        for theme in themes:
            if self._AI_TOKEN.search(theme.lower()):
                mapped.append(f"{theme}{self._MAPPING_ARROW}{self._AI_CATEGORY}")
            else:
                novel.append(theme)
        return {"mapped": mapped, "novel": novel}

    def save_outputs(self, comparison_df, taxonomy_map, topic_table):
        """Write comparison.csv, topic_review_table.csv and taxonomy_map.json."""
        comparison_df.to_csv("comparison.csv", index=False)
        topic_table.to_csv("topic_review_table.csv", index=False)
        with open("taxonomy_map.json", "w", encoding="utf-8") as f:
            json.dump(taxonomy_map, f, indent=2)

    def generate_keywords_csv(self, topic_table, taxonomy_map):
        """Write keywords.csv with one row per topic and its mapping status.

        Args:
            topic_table: DataFrame with ``topic_id``, ``keywords``, ``label``
                and ``document_count`` columns.
            taxonomy_map: Result of ``map_to_taxonomy``.
        """
        mapped_dict = {}
        for item in taxonomy_map.get("mapped", []):
            # Split on the last arrow so a theme containing the separator is
            # still recovered intact.
            theme, arrow, category = item.rpartition(self._MAPPING_ARROW)
            if arrow:
                mapped_dict[theme] = category
        rows = [
            {
                "ID": record['topic_id'],
                "type": "topic",
                "keywords": record['keywords'],
                "mapped_category": mapped_dict.get(record['label'], "Unknown"),
                "mapping_status": "MAPPED" if record['label'] in mapped_dict else "NOVEL",
                "relevance": record['document_count'],
            }
            for record in topic_table.to_dict("records")
        ]
        columns = ["ID", "type", "keywords", "mapped_category", "mapping_status", "relevance"]
        # Explicit columns keep the header row even when there are no topics.
        pd.DataFrame(rows, columns=columns).to_csv("keywords.csv", index=False)
        print("keywords.csv generated")