ferferefer committed
Commit: e5a9b78
Parent(s): 24d5b7c

cur

Files changed:
- requirements.txt +1 -1
- research_agent.py +124 -106
requirements.txt CHANGED

@@ -3,7 +3,7 @@ markdown==3.6
 pyautogen==0.2.25
 beautifulsoup4==4.12.3
 requests==2.31.0
-
+scholarly==1.7.11
 langchain==0.1.12
 chromadb==0.4.24
 sentence-transformers==2.5.1
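The newly pinned scholarly package drives the Google Scholar lookup added below in research_agent.py. As a minimal usage sketch (the query string is only an illustration, not part of the commit):

from scholarly import scholarly

# Illustrative query; search_pubs returns a generator of publication dicts.
results = scholarly.search_pubs("retrieval augmented generation")
first = next(results)
print(first.get('bib', {}).get('title', ''))   # paper title
print(first.get('pub_url'))                    # landing-page URL, later used for scraping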
research_agent.py CHANGED

@@ -9,6 +9,7 @@ from langchain.chains import RetrievalQA
 from langchain.llms import OpenAI
 import urllib.parse
 import time
+from scholarly import scholarly

 class ResearchAgent:
     def __init__(self, openai_api_key: str):

@@ -19,131 +20,148 @@ class ResearchAgent:
             'Connection': 'keep-alive',
         }
         self.openai_api_key = openai_api_key
+        self.scientific_domains = [
+            'sciencedirect.com',
+            'springer.com',
+            'nature.com',
+            'ncbi.nlm.nih.gov',
+            'wiley.com',
+            'scielo.org',
+            'frontiersin.org',
+            'mdpi.com',
+            'hindawi.com',
+            'tandfonline.com'
+        ]

     def extract_keywords(self, question: str) -> str:
         llm = OpenAI(api_key=self.openai_api_key)
-        prompt = f"Extract 3-4 most important
+        prompt = f"Extract 3-4 most important scientific search terms from this question, provide them as a space-separated list: {question}"
         return llm.predict(prompt)

-    def
-        #
-        search_query = f"{keywords} site:ncbi.nlm.nih.gov/pmc/articles"
-        encoded_query = urllib.parse.quote(search_query)
-        search_url = f"https://www.google.com/search?q={encoded_query}&num={num_results}"
+    def search_papers(self, keywords: str, num_results: int = 10) -> List[Dict]:
+        # First try Google Scholar
         try:
-            return list(dict.fromkeys(search_results))[:num_results]  # Remove duplicates
+            search_query = scholarly.search_pubs(keywords)
+            papers = []
+            for i in range(num_results):
+                try:
+                    paper = next(search_query)
+                    if paper.get('pub_url'):
+                        papers.append({
+                            'title': paper.get('bib', {}).get('title', ''),
+                            'url': paper.get('pub_url'),
+                            'abstract': paper.get('bib', {}).get('abstract', '')
+                        })
+                except StopIteration:
+                    break
         except Exception as e:
-            print(f"
+            print(f"Scholar search failed: {str(e)}")
+            papers = []
+
+        # Fallback to regular Google search if needed
+        if len(papers) < num_results:
+            remaining = num_results - len(papers)
+            search_query = f"{keywords} filetype:pdf site:({' OR site:'.join(self.scientific_domains)})"
+            encoded_query = urllib.parse.quote(search_query)
+            search_url = f"https://www.google.com/search?q={encoded_query}&num={remaining*2}"  # Get more results as some might fail

+            try:
+                response = requests.get(search_url, headers=self.headers, timeout=10)
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                for result in soup.find_all('a'):
+                    if len(papers) >= num_results:
+                        break
+
+                    link = result.get('href', '')
+                    if any(domain in link for domain in self.scientific_domains):
+                        clean_link = re.search(r'https?://[^&]+', link)
+                        if clean_link:
+                            papers.append({
+                                'url': clean_link.group(0),
+                                'title': '',  # Will be filled during scraping
+                                'abstract': ''  # Will be filled during scraping
+                            })
+
+            except Exception as e:
+                print(f"Google search failed: {str(e)}")
+
+        return papers
+
+    def scrape_paper(self, paper: Dict) -> Dict[str, str]:
         try:
-            response = requests.get(url, headers=self.headers, timeout=
+            response = requests.get(paper['url'], headers=self.headers, timeout=15)
             soup = BeautifulSoup(response.text, 'html.parser')

             # Try different title selectors
-            # Extract PMC ID
-            pmc_id = None
-            pmc_match = re.search(r'PMC\d+', url)
-            if pmc_match:
-                pmc_id = pmc_match.group(0)
-
-            # Extract DOI as backup
-            doi = None
-            doi_pattern = r'10\.\d{4,}/[-._;()/:\w]+'
-            doi_match = re.search(doi_pattern, response.text)
-            if doi_match:
-                doi = doi_match.group(0)
-
-            reference_id = pmc_id if pmc_id else (doi if doi else url)
-
-            return {
-                'title': title,
-                'abstract': abstract,
-                'url': url,
-                'reference': reference_id
-            }
+            if not paper['title']:
+                title_selectors = [
+                    'h1',
+                    'h1.article-title',
+                    'h1.title',
+                    'div.article-title',
+                    'meta[name="citation_title"]'
+                ]
+
+                for selector in title_selectors:
+                    title_elem = soup.select_one(selector)
+                    if title_elem:
+                        paper['title'] = title_elem.get('content', '') or title_elem.text.strip()
+                        break
+
+            # Try different abstract/content selectors
+            if not paper['abstract']:
+                content_selectors = [
+                    'div.abstract',
+                    'section.abstract',
+                    'div#abstract',
+                    'meta[name="description"]',
+                    'div.paper-content',
+                    'div.article-content'
+                ]
+
+                for selector in content_selectors:
+                    content_elem = soup.select_one(selector)
+                    if content_elem:
+                        paper['abstract'] = content_elem.get('content', '') or content_elem.text.strip()
+                        break
+
+            if not paper['abstract']:
+                # Try to get main content
+                paragraphs = soup.find_all('p')
+                content = ' '.join([p.text.strip() for p in paragraphs[:5]])  # Get first 5 paragraphs
+                if content:
+                    paper['abstract'] = content
+
+            # Clean up text
+            paper['title'] = paper['title'] or "Title not found"
+            paper['abstract'] = paper['abstract'] or "Content not found"
+            paper['abstract'] = re.sub(r'\s+', ' ', paper['abstract'])
+
+            return paper
         except Exception as e:
-            print(f"Error scraping {url}: {str(e)}")
-            return
+            print(f"Error scraping {paper['url']}: {str(e)}")
+            return paper

     def perform_research(self, question: str) -> str:
         # Extract keywords
         keywords = self.extract_keywords(question)
         print(f"Keywords extracted: {keywords}")

-        # Search
-        print(f"Found {len(
+        # Search for papers
+        papers = self.search_papers(keywords)
+        print(f"Found {len(papers)} papers")

+        # Scrape full content
         articles = []
-        for
-            article = self.
-            if article and article['abstract'] != "
+        for paper in papers:
+            article = self.scrape_paper(paper)
+            if article and article['abstract'] != "Content not found":
                 articles.append(article)
             time.sleep(1)  # Polite delay between requests

         if not articles:
-            return "I apologize, but I couldn't find any relevant
+            return "I apologize, but I couldn't find any relevant scientific papers to answer your question. Please try rephrasing your question or using different terms."

         # Prepare documents for RAG
         text_splitter = RecursiveCharacterTextSplitter(

@@ -155,7 +173,7 @@
         for article in articles:
             chunks = text_splitter.split_text(article['abstract'])
             for chunk in chunks:
-                texts.append(f"Title: {article['title']}\n\
+                texts.append(f"Title: {article['title']}\n\nContent: {chunk}\n\nSource: {article['url']}")

         # Create vector store
         embeddings = HuggingFaceEmbeddings()

@@ -170,15 +188,15 @@
         )

         # Get answer with references
-        result = qa_chain({"query": f"""Based on the provided
-        If you can't find a direct answer, summarize the most relevant information from the
-        Include specific findings and
+        result = qa_chain({"query": f"""Based on the provided scientific papers, please answer this question: {question}
+        If you can't find a direct answer, summarize the most relevant information from the papers.
+        Include specific findings, data, and methodology when available."""})

         answer = result['result']

         # Format response with article summaries
-        response = f"Answer: {answer}\n\
+        response = f"Answer: {answer}\n\nReferences:\n\n"
         for article in articles:
-            response += f"Title: {article['title']}\
+            response += f"Title: {article['title']}\nURL: {article['url']}\n\n"

         return response
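For reference, a minimal sketch of how the updated agent would be driven end to end; the environment variable and the example question are assumptions for illustration, not part of the commit:

import os
from research_agent import ResearchAgent

# Assumes the OpenAI key is provided via an environment variable (illustrative).
agent = ResearchAgent(openai_api_key=os.environ["OPENAI_API_KEY"])
report = agent.perform_research("Does intermittent fasting improve insulin sensitivity?")
print(report)  # answer followed by a reference list of the scraped papers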