ferferefer committed
Commit
e5a9b78
1 Parent(s): 24d5b7c
Files changed (2)
  1. requirements.txt +1 -1
  2. research_agent.py +124 -106
requirements.txt CHANGED
@@ -3,7 +3,7 @@ markdown==3.6
  pyautogen==0.2.25
  beautifulsoup4==4.12.3
  requests==2.31.0
- googlesearch-python==1.2.3
+ scholarly==1.7.11
  langchain==0.1.12
  chromadb==0.4.24
  sentence-transformers==2.5.1
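The dependency change above swaps Google-search scraping for the scholarly package. As a rough sketch of what that package returns (the query string below is only an illustration; the field names follow the ones this commit reads, namely 'bib', 'title', 'abstract', and 'pub_url'), keeping in mind that Google Scholar may rate-limit unauthenticated requests:

from scholarly import scholarly

# Hypothetical query string; any keyword string works the same way
results = scholarly.search_pubs("macular degeneration anti-VEGF")

for _ in range(3):
    try:
        paper = next(results)  # each item is a dict-like publication record
    except StopIteration:
        break
    bib = paper.get('bib', {})
    print(bib.get('title', ''), '->', paper.get('pub_url'))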
research_agent.py CHANGED
@@ -9,6 +9,7 @@ from langchain.chains import RetrievalQA
9
  from langchain.llms import OpenAI
10
  import urllib.parse
11
  import time
 
12
 
13
  class ResearchAgent:
14
  def __init__(self, openai_api_key: str):
@@ -19,131 +20,148 @@ class ResearchAgent:
              'Connection': 'keep-alive',
          }
          self.openai_api_key = openai_api_key

      def extract_keywords(self, question: str) -> str:
          llm = OpenAI(api_key=self.openai_api_key)
-         prompt = f"Extract 3-4 most important medical search terms from this question, provide them as a space-separated list: {question}"
          return llm.predict(prompt)

-     def search_pubmed(self, keywords: str, num_results: int = 10) -> List[str]:
-         # Format the search URL for Google
-         search_query = f"{keywords} site:ncbi.nlm.nih.gov/pmc/articles"
-         encoded_query = urllib.parse.quote(search_query)
-         search_url = f"https://www.google.com/search?q={encoded_query}&num={num_results}"
-
          try:
-             response = requests.get(search_url, headers=self.headers, timeout=10)
-             soup = BeautifulSoup(response.text, 'html.parser')
-
-             # Extract URLs from Google search results
-             search_results = []
-             for result in soup.find_all('a'):
-                 link = result.get('href', '')
-                 if 'ncbi.nlm.nih.gov/pmc/articles' in link and 'google' not in link:
-                     # Extract actual URL from Google's redirect URL
-                     clean_link = re.search(r'https://www\.ncbi\.nlm\.nih\.gov/pmc/articles/[^&]+', link)
-                     if clean_link:
-                         search_results.append(clean_link.group(0))
-
-             return list(dict.fromkeys(search_results))[:num_results]  # Remove duplicates
          except Exception as e:
-             print(f"Error in search: {str(e)}")
-             return []

-     def scrape_article(self, url: str) -> Dict[str, str]:
          try:
-             response = requests.get(url, headers=self.headers, timeout=10)
              soup = BeautifulSoup(response.text, 'html.parser')

              # Try different title selectors
-             title = None
-             title_selectors = [
-                 'h1.content-title',
-                 'div.content-title',
-                 'h1#article-title-1',
-                 'div.article-title',
-                 'title'
-             ]
-
-             for selector in title_selectors:
-                 title_elem = soup.select_one(selector)
-                 if title_elem:
-                     title = title_elem.text.strip()
-                     break
-
-             if not title:
-                 title = "Title not found"
-
-             # Try different abstract selectors
-             abstract = None
-             abstract_selectors = [
-                 'div.abstract',
-                 'div#abstract-1',
-                 'section.abstract',
-                 'div[id^="abstract"]',
-                 'abstract'
-             ]
-
-             for selector in abstract_selectors:
-                 abstract_elem = soup.select_one(selector)
-                 if abstract_elem:
-                     # Remove any "Abstract" header if present
-                     for header in abstract_elem.find_all(['h2', 'h3', 'h4']):
-                         header.decompose()
-                     abstract = abstract_elem.text.strip()
-                     break
-
-             if not abstract:
-                 # Try to get the first significant paragraph
-                 first_para = soup.find('div', {'class': 'body'})
-                 if first_para:
-                     abstract = first_para.find('p').text.strip()
-                 else:
-                     abstract = "Abstract not found"
-
-             # Extract PMC ID
-             pmc_id = None
-             pmc_match = re.search(r'PMC\d+', url)
-             if pmc_match:
-                 pmc_id = pmc_match.group(0)
-
-             # Extract DOI as backup
-             doi = None
-             doi_pattern = r'10\.\d{4,}/[-._;()/:\w]+'
-             doi_match = re.search(doi_pattern, response.text)
-             if doi_match:
-                 doi = doi_match.group(0)
-
-             reference_id = pmc_id if pmc_id else (doi if doi else url)
-
-             return {
-                 'title': title,
-                 'abstract': abstract,
-                 'url': url,
-                 'reference': reference_id
-             }
          except Exception as e:
-             print(f"Error scraping {url}: {str(e)}")
-             return None

      def perform_research(self, question: str) -> str:
          # Extract keywords
          keywords = self.extract_keywords(question)
          print(f"Keywords extracted: {keywords}")

-         # Search and scrape articles
-         urls = self.search_pubmed(keywords)
-         print(f"Found {len(urls)} articles")

          articles = []
-         for url in urls:
-             article = self.scrape_article(url)
-             if article and article['abstract'] != "Abstract not found":
                  articles.append(article)
              time.sleep(1)  # Polite delay between requests

          if not articles:
-             return "I apologize, but I couldn't find any relevant articles to answer your question. Please try rephrasing your question or using different terms."

          # Prepare documents for RAG
          text_splitter = RecursiveCharacterTextSplitter(
@@ -155,7 +173,7 @@ class ResearchAgent:
          for article in articles:
              chunks = text_splitter.split_text(article['abstract'])
              for chunk in chunks:
-                 texts.append(f"Title: {article['title']}\n\nAbstract: {chunk}\n\nReference: {article['reference']}")

          # Create vector store
          embeddings = HuggingFaceEmbeddings()
@@ -170,15 +188,15 @@ class ResearchAgent:
          )

          # Get answer with references
-         result = qa_chain({"query": f"""Based on the provided research articles, please answer this question: {question}
-             If you can't find a direct answer, summarize the most relevant information from the articles.
-             Include specific findings and data when available."""})

          answer = result['result']

          # Format response with article summaries
-         response = f"Answer: {answer}\n\nArticles Referenced:\n\n"
          for article in articles:
-             response += f"Title: {article['title']}\nReference: {article['reference']}\nURL: {article['url']}\n\n"

          return response
 
  from langchain.llms import OpenAI
  import urllib.parse
  import time
+ from scholarly import scholarly

  class ResearchAgent:
      def __init__(self, openai_api_key: str):
 
              'Connection': 'keep-alive',
          }
          self.openai_api_key = openai_api_key
+         self.scientific_domains = [
+             'sciencedirect.com',
+             'springer.com',
+             'nature.com',
+             'ncbi.nlm.nih.gov',
+             'wiley.com',
+             'scielo.org',
+             'frontiersin.org',
+             'mdpi.com',
+             'hindawi.com',
+             'tandfonline.com'
+         ]

      def extract_keywords(self, question: str) -> str:
          llm = OpenAI(api_key=self.openai_api_key)
+         prompt = f"Extract 3-4 most important scientific search terms from this question, provide them as a space-separated list: {question}"
          return llm.predict(prompt)

+     def search_papers(self, keywords: str, num_results: int = 10) -> List[Dict]:
+         # First try Google Scholar
          try:
+             search_query = scholarly.search_pubs(keywords)
+             papers = []
+             for i in range(num_results):
+                 try:
+                     paper = next(search_query)
+                     if paper.get('pub_url'):
+                         papers.append({
+                             'title': paper.get('bib', {}).get('title', ''),
+                             'url': paper.get('pub_url'),
+                             'abstract': paper.get('bib', {}).get('abstract', '')
+                         })
+                 except StopIteration:
+                     break
          except Exception as e:
+             print(f"Scholar search failed: {str(e)}")
+             papers = []
+
+         # Fallback to regular Google search if needed
+         if len(papers) < num_results:
+             remaining = num_results - len(papers)
+             search_query = f"{keywords} filetype:pdf site:({' OR site:'.join(self.scientific_domains)})"
+             encoded_query = urllib.parse.quote(search_query)
+             search_url = f"https://www.google.com/search?q={encoded_query}&num={remaining*2}"  # Get more results as some might fail

+             try:
+                 response = requests.get(search_url, headers=self.headers, timeout=10)
+                 soup = BeautifulSoup(response.text, 'html.parser')
+
+                 for result in soup.find_all('a'):
+                     if len(papers) >= num_results:
+                         break
+
+                     link = result.get('href', '')
+                     if any(domain in link for domain in self.scientific_domains):
+                         clean_link = re.search(r'https?://[^&]+', link)
+                         if clean_link:
+                             papers.append({
+                                 'url': clean_link.group(0),
+                                 'title': '',  # Will be filled during scraping
+                                 'abstract': ''  # Will be filled during scraping
+                             })
+
+             except Exception as e:
+                 print(f"Google search failed: {str(e)}")
+
+         return papers
+
+     def scrape_paper(self, paper: Dict) -> Dict[str, str]:
          try:
+             response = requests.get(paper['url'], headers=self.headers, timeout=15)
              soup = BeautifulSoup(response.text, 'html.parser')

              # Try different title selectors
+             if not paper['title']:
+                 title_selectors = [
+                     'h1',
+                     'h1.article-title',
+                     'h1.title',
+                     'div.article-title',
+                     'meta[name="citation_title"]'
+                 ]
+
+                 for selector in title_selectors:
+                     title_elem = soup.select_one(selector)
+                     if title_elem:
+                         paper['title'] = title_elem.get('content', '') or title_elem.text.strip()
+                         break
+
+             # Try different abstract/content selectors
+             if not paper['abstract']:
+                 content_selectors = [
+                     'div.abstract',
+                     'section.abstract',
+                     'div#abstract',
+                     'meta[name="description"]',
+                     'div.paper-content',
+                     'div.article-content'
+                 ]
+
+                 for selector in content_selectors:
+                     content_elem = soup.select_one(selector)
+                     if content_elem:
+                         paper['abstract'] = content_elem.get('content', '') or content_elem.text.strip()
+                         break
+
+             if not paper['abstract']:
+                 # Try to get main content
+                 paragraphs = soup.find_all('p')
+                 content = ' '.join([p.text.strip() for p in paragraphs[:5]])  # Get first 5 paragraphs
+                 if content:
+                     paper['abstract'] = content
+
+             # Clean up text
+             paper['title'] = paper['title'] or "Title not found"
+             paper['abstract'] = paper['abstract'] or "Content not found"
+             paper['abstract'] = re.sub(r'\s+', ' ', paper['abstract'])
+
+             return paper
          except Exception as e:
+             print(f"Error scraping {paper['url']}: {str(e)}")
+             return paper

      def perform_research(self, question: str) -> str:
          # Extract keywords
          keywords = self.extract_keywords(question)
          print(f"Keywords extracted: {keywords}")

+         # Search for papers
+         papers = self.search_papers(keywords)
+         print(f"Found {len(papers)} papers")

+         # Scrape full content
          articles = []
+         for paper in papers:
+             article = self.scrape_paper(paper)
+             if article and article['abstract'] != "Content not found":
                  articles.append(article)
              time.sleep(1)  # Polite delay between requests

          if not articles:
+             return "I apologize, but I couldn't find any relevant scientific papers to answer your question. Please try rephrasing your question or using different terms."

          # Prepare documents for RAG
          text_splitter = RecursiveCharacterTextSplitter(
 
          for article in articles:
              chunks = text_splitter.split_text(article['abstract'])
              for chunk in chunks:
+                 texts.append(f"Title: {article['title']}\n\nContent: {chunk}\n\nSource: {article['url']}")

          # Create vector store
          embeddings = HuggingFaceEmbeddings()
 
          )

          # Get answer with references
+         result = qa_chain({"query": f"""Based on the provided scientific papers, please answer this question: {question}
+             If you can't find a direct answer, summarize the most relevant information from the papers.
+             Include specific findings, data, and methodology when available."""})

          answer = result['result']

          # Format response with article summaries
+         response = f"Answer: {answer}\n\nReferences:\n\n"
          for article in articles:
+             response += f"Title: {article['title']}\nURL: {article['url']}\n\n"

          return response
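For context, a minimal driver for the updated agent might look like the sketch below; the environment-variable handling and the example question are illustrative assumptions, not part of this commit:

import os
from research_agent import ResearchAgent

# Assumes the OpenAI key is supplied via the environment (hypothetical setup)
agent = ResearchAgent(openai_api_key=os.environ["OPENAI_API_KEY"])

# Hypothetical research question; perform_research returns the formatted answer string
print(agent.perform_research("What are the current first-line treatments for diabetic retinopathy?"))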