rahull30 committed on
Commit
4bf6942
·
verified ·
1 Parent(s): 7e8b410

Initial Files

Browse files
Files changed (4) hide show
  1. agent.py +95 -0
  2. app.py +41 -0
  3. requirements.txt +6 -0
  4. tools.py +144 -0
agent.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tools import ResearchTools
2
+ import pandas as pd
3
+ from typing import Dict
4
+
5
class ResearchAgent:
    """Run the topic-modeling pipeline end to end on top of ``ResearchTools``.

    The agent loads a CSV, preprocesses the text, clusters it into topics,
    labels the topics, maps the labels to the taxonomy and writes the output
    files through the tools object.
    """

    def __init__(self):
        self.tools = ResearchTools()
        # Summary of the most recent run (document and topic counts).
        self.results = {}

    def plan(self):
        """Store the ordered pipeline steps on ``self.pipeline`` and print them."""
        self.pipeline = [
            "Load and validate data",
            "Preprocess text",
            "Perform topic modeling",
            "Label topics",
            "Compare title vs abstract themes",
            "Extract unique themes",
            "Map themes to taxonomy",
            "Generate outputs",
        ]
        print("📋 Pipeline planned:")
        for i, step in enumerate(self.pipeline, 1):
            print(f" {i}. {step}")

    @staticmethod
    def _resolve_topic_count(requested: int, n_documents: int) -> int:
        """Clamp the requested topic count to the range ``1..n_documents``.

        Clustering cannot produce more clusters than there are documents, so
        a small corpus must use fewer topics than the default.
        """
        return max(1, min(int(requested), int(n_documents)))

    def execute_pipeline(self, csv_path: str, n_topics: int = 100) -> Dict:
        """Run every pipeline step for the CSV at ``csv_path``.

        Args:
            csv_path: Path of the CSV file to analyse.
            n_topics: Upper bound on the number of topics. It is reduced to
                the number of loaded documents when the corpus is smaller.

        Returns:
            A dict with ``num_documents`` and ``num_topics`` on success, or
            ``{"error": <message>}`` when any step raises.
        """
        print("=" * 60)
        print("🤖 RESEARCH AGENT - STARTING PIPELINE")
        print("=" * 60)

        # Start from a clean summary so values from an earlier run on this
        # (possibly shared) instance never leak into the current result.
        self.results = {}

        try:
            self.plan()
            print()

            # Load
            print("📂 Loading data...")
            df = self.tools.load_csv(csv_path)
            if df is None or df.empty:
                raise ValueError("DataFrame is empty")

            self.results['num_documents'] = len(df)

            # Preprocess
            print("🧹 Preprocessing...")
            df = self.tools.preprocess_corpus(df)

            # Topic modeling
            print("🎯 Topic modeling...")
            topic_model, topic_info = self.tools.perform_topic_modeling(
                df['combined_clean'].tolist(),
                n_topics=self._resolve_topic_count(n_topics, len(df)),
            )

            self.results['num_topics'] = len(topic_info)

            # Label
            print("🏷️ Labeling topics...")
            label_df = self.tools.label_topics(topic_model, topic_info)

            # Attach the per-topic document counts to the labels.
            topic_table = pd.merge(
                topic_info[['Topic', 'Count']],
                label_df,
                left_on='Topic',
                right_on='topic_id',
                how='left',
            )

            topic_table = topic_table[['topic_id', 'keywords', 'label', 'Count']]
            topic_table = topic_table.rename(columns={'Count': 'document_count'})

            # Compare
            print("🔄 Comparing...")
            comparison_df = self.tools.compare_title_abstract_themes(df, topic_model)

            # Themes
            print("📊 Extracting themes...")
            all_themes = self.tools.extract_themes(label_df['label'].tolist())

            # Mapping
            print("🗺️ Mapping...")
            taxonomy_map = self.tools.map_to_taxonomy(all_themes)

            # Save outputs
            print("💾 Saving outputs...")
            self.tools.save_outputs(comparison_df, taxonomy_map, topic_table)

            # Additional keywords.csv export
            self.tools.generate_keywords_csv(topic_table, taxonomy_map)

            print("✅ DONE")
            return self.results

        except Exception as e:
            # Top-level boundary: report the failure to the caller as data
            # (the UI shows the message) while keeping the traceback in logs.
            import traceback
            traceback.print_exc()
            return {"error": str(e)}
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from agent import ResearchAgent
3
+
4
# Single module-level agent instance; run_pipeline calls it for every upload.
+ agent = ResearchAgent()
5
+
6
def run_pipeline(file):
    """Run the research pipeline on an uploaded CSV and report the outputs.

    Args:
        file: The uploaded file, either a path string or an object that
            exposes the path as ``.name``; ``None`` when nothing was uploaded.

    Returns:
        A 5-tuple ``(status, comparison, taxonomy_map, topic_table, keywords)``.
        On success the last four items are the output file names written by
        the pipeline; on any failure they are ``None`` and ``status`` carries
        the message.
    """
    def failure(message):
        # Every failure path yields a status text and no downloadable files.
        return message, None, None, None, None

    if file is None:
        return failure("Upload a CSV file")

    try:
        # Accept both a plain path and a file wrapper carrying ``.name``.
        csv_path = getattr(file, "name", file)
        result = agent.execute_pipeline(csv_path)
    except Exception as e:
        return failure(str(e))

    if "error" in result:
        return failure(result["error"])

    return (
        "✅ Pipeline completed",
        "comparison.csv",
        "taxonomy_map.json",
        "topic_review_table.csv",
        "keywords.csv",
    )
26
+
27
+
28
# Output widgets, listed in the same order as the tuple run_pipeline returns:
# status text first, then one download slot per generated file.
_output_components = [
    gr.Textbox(label="Status"),
    gr.File(label="Download comparison.csv"),
    gr.File(label="Download taxonomy_map.json"),
    gr.File(label="Download topic_review_table.csv"),
    gr.File(label="Download keywords.csv"),
]
_upload_component = gr.File(label="Upload CSV")

demo = gr.Interface(
    fn=run_pipeline,
    inputs=_upload_component,
    outputs=_output_components,
    title="Topic Modeling App",
)

demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas==2.0.3
2
+ numpy==1.24.3
3
+ scikit-learn==1.3.0
4
+ nltk==3.8.1
5
+ gradio==3.41.2
6
+ umap-learn==0.5.4
tools.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.feature_extraction.text import CountVectorizer
4
+ from sklearn.cluster import KMeans
5
+ import nltk
6
+ from nltk.corpus import stopwords
7
+ from nltk.tokenize import word_tokenize
8
+ from nltk.stem import WordNetLemmatizer
9
+ import re
10
+ import json
11
+
12
# NLTK setup: fetch the data packages ResearchTools relies on
# (stop-word list, tokenizer data, WordNet for lemmatization).
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource, quiet=True)
16
+
17
class ResearchTools:
    """Helpers for loading, cleaning, clustering and exporting a paper corpus."""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # Known taxonomy categories.
        self.taxonomy = [
            "Artificial Intelligence and Machine Learning",
            "Blockchain and Distributed Ledger",
            "Cloud Computing",
            "Data Analytics and Business Intelligence",
        ]

    def load_csv(self, filepath):
        """Read a CSV and return only the rows that have a title and abstract.

        Column names are stripped and lower-cased before validation.

        Raises:
            ValueError: If the ``title`` or ``abstract`` column is missing.
        """
        df = pd.read_csv(filepath)
        df.columns = df.columns.str.strip().str.lower()

        if 'title' not in df.columns or 'abstract' not in df.columns:
            raise ValueError("CSV must contain title and abstract")

        return df.dropna(subset=['title', 'abstract'])

    def clean_text(self, text):
        """Lower-case, strip non-letters, drop stop words and lemmatize.

        The value is converted with ``str`` first so that non-string cells
        (for example a purely numeric title) do not raise.
        """
        text = str(text).lower()
        text = re.sub(r'[^a-z\s]', ' ', text)
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(t) for t in tokens if t not in self.stop_words]
        return ' '.join(tokens)

    def preprocess_corpus(self, df):
        """Add a ``combined_clean`` column (cleaned title + cleaned abstract).

        The frame is modified in place and also returned.
        """
        df['combined_clean'] = (
            df['title'].apply(self.clean_text) + " " + df['abstract'].apply(self.clean_text)
        )
        return df

    def perform_topic_modeling(self, docs, n_topics=100):
        """Cluster ``docs`` with bag-of-words counts and KMeans.

        Args:
            docs: Iterable of cleaned document strings.
            n_topics: Requested number of clusters. It is clamped to the
                number of documents because KMeans cannot fit more clusters
                than samples.

        Returns:
            ``(model, topic_info)``. ``topic_info`` has one row per topic with
            columns ``Topic`` and ``Count``. ``model`` offers ``get_topic(i)``
            (top keywords, each with weight 1.0) and ``transform(docs)``.

        Raises:
            ValueError: If ``docs`` is empty.
        """
        docs = list(docs)
        if not docs:
            raise ValueError("No documents to model")
        n_topics = max(1, min(int(n_topics), len(docs)))

        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(docs)

        # n_init is set explicitly so results do not depend on the
        # library's changing default.
        kmeans = KMeans(n_clusters=n_topics, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)

        feature_names = vectorizer.get_feature_names_out()

        # Ten highest-weighted vocabulary terms of every cluster centre.
        topic_keywords = [
            [feature_names[j] for j in center.argsort()[::-1][:10]]
            for center in kmeans.cluster_centers_
        ]

        topic_info = pd.DataFrame({
            'Topic': list(range(n_topics)),
            'Count': np.bincount(labels, minlength=n_topics),
        })

        class Model:
            """Thin adapter over the fitted clustering result."""

            def get_topic(self, i):
                return [(w, 1.0) for w in topic_keywords[i]]

            def transform(self, docs):
                # NOTE(review): ignores ``docs`` and returns the labels of
                # the fitted corpus; confirm whether re-assignment is needed.
                return labels, None

        return Model(), topic_info

    def label_topics(self, model, topic_info):
        """Build a frame of ``topic_id``, ``label`` and ``keywords`` per topic.

        The label is the first three keywords joined with `` | ``.
        """
        data = []
        for tid in topic_info['Topic']:
            kw = [w for w, _ in model.get_topic(tid)]
            data.append({
                'topic_id': tid,
                'label': ' | '.join(kw[:3]),
                'keywords': ', '.join(kw),
            })
        return pd.DataFrame(data)

    def extract_themes(self, labels):
        """Return the unique labels in first-seen order (deterministic)."""
        return list(dict.fromkeys(labels))

    def compare_title_abstract_themes(self, df, model):
        """Return the title/abstract comparison table.

        NOTE(review): currently a placeholder that returns one fixed sample
        row and does not use ``df`` or ``model``.
        """
        return pd.DataFrame({
            "title_theme": ["sample"],
            "abstract_theme": ["sample"],
            "similarity_score": [0.5],
        })

    def map_to_taxonomy(self, themes):
        """Split themes into taxonomy-mapped and novel ones.

        A theme maps to the AI category only when it contains ``ai`` as a
        whole word; a plain substring test would also match words such as
        "train" or "maintain".

        Returns:
            ``{"mapped": [...], "novel": [...]}`` where mapped entries have
            the form ``"<theme> → <category>"``.
        """
        mapped = []
        novel = []

        for t in themes:
            if re.search(r'\bai\b', t.lower()):
                mapped.append(f"{t} → Artificial Intelligence and Machine Learning")
            else:
                novel.append(t)

        return {"mapped": mapped, "novel": novel}

    def save_outputs(self, comparison_df, taxonomy_map, topic_table):
        """Write comparison.csv, topic_review_table.csv and taxonomy_map.json."""
        comparison_df.to_csv("comparison.csv", index=False)
        topic_table.to_csv("topic_review_table.csv", index=False)

        with open("taxonomy_map.json", "w", encoding="utf-8") as f:
            json.dump(taxonomy_map, f, indent=2)

    def generate_keywords_csv(self, topic_table, taxonomy_map):
        """Write keywords.csv with one row per topic and its mapping status.

        Args:
            topic_table: Frame with ``topic_id``, ``keywords``, ``label`` and
                ``document_count`` columns.
            taxonomy_map: Dict whose ``"mapped"`` entries have the form
                ``"<theme> → <category>"``.
        """
        # theme label -> taxonomy category
        mapped_dict = {}
        for item in taxonomy_map.get("mapped", []):
            theme, sep, category = item.rpartition(" → ")
            if sep:
                mapped_dict[theme] = category

        rows = [
            {
                "ID": row['topic_id'],
                "type": "topic",
                "keywords": row['keywords'],
                "mapped_category": mapped_dict.get(row['label'], "Unknown"),
                "mapping_status": "MAPPED" if row['label'] in mapped_dict else "NOVEL",
                "relevance": row['document_count'],
            }
            for row in topic_table.to_dict("records")
        ]

        # Explicit columns keep the header row even when there are no topics.
        columns = ["ID", "type", "keywords", "mapped_category", "mapping_status", "relevance"]
        pd.DataFrame(rows, columns=columns).to_csv("keywords.csv", index=False)
        print("keywords.csv generated")