ferferefer committed
Commit
e5a9b78
1 Parent(s): 24d5b7c
Files changed (2)
  1. requirements.txt +1 -1
  2. research_agent.py +124 -106
requirements.txt CHANGED
@@ -3,7 +3,7 @@ markdown==3.6
  pyautogen==0.2.25
  beautifulsoup4==4.12.3
  requests==2.31.0
- googlesearch-python==1.2.3
+ scholarly==1.7.11
  langchain==0.1.12
  chromadb==0.4.24
  sentence-transformers==2.5.1
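The dependency change above swaps Google-search scraping for the scholarly package. As a rough sketch of what that package returns (the query string below is only an illustration; the field names follow the ones this commit reads, namely 'bib', 'title', 'abstract', and 'pub_url'), keeping in mind that Google Scholar may rate-limit unauthenticated requests:

from scholarly import scholarly

# Hypothetical query string; any keyword string works the same way
results = scholarly.search_pubs("macular degeneration anti-VEGF")

for _ in range(3):
    try:
        paper = next(results)  # each item is a dict-like publication record
    except StopIteration:
        break
    bib = paper.get('bib', {})
    print(bib.get('title', ''), '->', paper.get('pub_url'))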
research_agent.py CHANGED
@@ -9,6 +9,7 @@ from langchain.chains import RetrievalQA
9
  from langchain.llms import OpenAI
10
  import urllib.parse
11
  import time
 
12
 
13
  class ResearchAgent:
14
  def __init__(self, openai_api_key: str):
@@ -19,131 +20,148 @@ class ResearchAgent:
              'Connection': 'keep-alive',
          }
          self.openai_api_key = openai_api_key

      def extract_keywords(self, question: str) -> str:
          llm = OpenAI(api_key=self.openai_api_key)
-         prompt = f"Extract 3-4 most important medical search terms from this question, provide them as a space-separated list: {question}"
          return llm.predict(prompt)

-     def search_pubmed(self, keywords: str, num_results: int = 10) -> List[str]:
-         # Format the search URL for Google
-         search_query = f"{keywords} site:ncbi.nlm.nih.gov/pmc/articles"
-         encoded_query = urllib.parse.quote(search_query)
-         search_url = f"https://www.google.com/search?q={encoded_query}&num={num_results}"
-
          try:
-             response = requests.get(search_url, headers=self.headers, timeout=10)
-             soup = BeautifulSoup(response.text, 'html.parser')
-
-             # Extract URLs from Google search results
-             search_results = []
-             for result in soup.find_all('a'):
-                 link = result.get('href', '')
-                 if 'ncbi.nlm.nih.gov/pmc/articles' in link and 'google' not in link:
-                     # Extract actual URL from Google's redirect URL
-                     clean_link = re.search(r'https://www\.ncbi\.nlm\.nih\.gov/pmc/articles/[^&]+', link)
-                     if clean_link:
-                         search_results.append(clean_link.group(0))
-
-             return list(dict.fromkeys(search_results))[:num_results]  # Remove duplicates
          except Exception as e:
-             print(f"Error in search: {str(e)}")
-             return []

-     def scrape_article(self, url: str) -> Dict[str, str]:
          try:
-             response = requests.get(url, headers=self.headers, timeout=10)
              soup = BeautifulSoup(response.text, 'html.parser')

              # Try different title selectors
-             title = None
-             title_selectors = [
-                 'h1.content-title',
-                 'div.content-title',
-                 'h1#article-title-1',
-                 'div.article-title',
-                 'title'
-             ]
-
-             for selector in title_selectors:
-                 title_elem = soup.select_one(selector)
-                 if title_elem:
-                     title = title_elem.text.strip()
-                     break
-
-             if not title:
-                 title = "Title not found"
-
-             # Try different abstract selectors
-             abstract = None
-             abstract_selectors = [
-                 'div.abstract',
-                 'div#abstract-1',
-                 'section.abstract',
-                 'div[id^="abstract"]',
-                 'abstract'
-             ]
-
-             for selector in abstract_selectors:
-                 abstract_elem = soup.select_one(selector)
-                 if abstract_elem:
-                     # Remove any "Abstract" header if present
-                     for header in abstract_elem.find_all(['h2', 'h3', 'h4']):
-                         header.decompose()
-                     abstract = abstract_elem.text.strip()
-                     break
-
-             if not abstract:
-                 # Try to get the first significant paragraph
-                 first_para = soup.find('div', {'class': 'body'})
-                 if first_para:
-                     abstract = first_para.find('p').text.strip()
-                 else:
-                     abstract = "Abstract not found"
-
-             # Extract PMC ID
-             pmc_id = None
-             pmc_match = re.search(r'PMC\d+', url)
-             if pmc_match:
-                 pmc_id = pmc_match.group(0)
-
-             # Extract DOI as backup
-             doi = None
-             doi_pattern = r'10\.\d{4,}/[-._;()/:\w]+'
-             doi_match = re.search(doi_pattern, response.text)
-             if doi_match:
-                 doi = doi_match.group(0)
-
-             reference_id = pmc_id if pmc_id else (doi if doi else url)
-
-             return {
-                 'title': title,
-                 'abstract': abstract,
-                 'url': url,
-                 'reference': reference_id
-             }
          except Exception as e:
-             print(f"Error scraping {url}: {str(e)}")
-             return None

      def perform_research(self, question: str) -> str:
          # Extract keywords
          keywords = self.extract_keywords(question)
          print(f"Keywords extracted: {keywords}")

-         # Search and scrape articles
-         urls = self.search_pubmed(keywords)
-         print(f"Found {len(urls)} articles")

          articles = []
-         for url in urls:
-             article = self.scrape_article(url)
-             if article and article['abstract'] != "Abstract not found":
                  articles.append(article)
              time.sleep(1)  # Polite delay between requests

          if not articles:
-             return "I apologize, but I couldn't find any relevant articles to answer your question. Please try rephrasing your question or using different terms."

          # Prepare documents for RAG
          text_splitter = RecursiveCharacterTextSplitter(
@@ -155,7 +173,7 @@ class ResearchAgent:
          for article in articles:
              chunks = text_splitter.split_text(article['abstract'])
              for chunk in chunks:
-                 texts.append(f"Title: {article['title']}\n\nAbstract: {chunk}\n\nReference: {article['reference']}")

          # Create vector store
          embeddings = HuggingFaceEmbeddings()
@@ -170,15 +188,15 @@ class ResearchAgent:
          )

          # Get answer with references
-         result = qa_chain({"query": f"""Based on the provided research articles, please answer this question: {question}
-             If you can't find a direct answer, summarize the most relevant information from the articles.
-             Include specific findings and data when available."""})

          answer = result['result']

          # Format response with article summaries
-         response = f"Answer: {answer}\n\nArticles Referenced:\n\n"
          for article in articles:
-             response += f"Title: {article['title']}\nReference: {article['reference']}\nURL: {article['url']}\n\n"

          return response
 
  from langchain.llms import OpenAI
  import urllib.parse
  import time
+ from scholarly import scholarly

  class ResearchAgent:
      def __init__(self, openai_api_key: str):
 
              'Connection': 'keep-alive',
          }
          self.openai_api_key = openai_api_key
+         self.scientific_domains = [
+             'sciencedirect.com',
+             'springer.com',
+             'nature.com',
+             'ncbi.nlm.nih.gov',
+             'wiley.com',
+             'scielo.org',
+             'frontiersin.org',
+             'mdpi.com',
+             'hindawi.com',
+             'tandfonline.com'
+         ]

      def extract_keywords(self, question: str) -> str:
          llm = OpenAI(api_key=self.openai_api_key)
+         prompt = f"Extract 3-4 most important scientific search terms from this question, provide them as a space-separated list: {question}"
          return llm.predict(prompt)

+     def search_papers(self, keywords: str, num_results: int = 10) -> List[Dict]:
+         # First try Google Scholar
          try:
+             search_query = scholarly.search_pubs(keywords)
+             papers = []
+             for i in range(num_results):
+                 try:
+                     paper = next(search_query)
+                     if paper.get('pub_url'):
+                         papers.append({
+                             'title': paper.get('bib', {}).get('title', ''),
+                             'url': paper.get('pub_url'),
+                             'abstract': paper.get('bib', {}).get('abstract', '')
+                         })
+                 except StopIteration:
+                     break
          except Exception as e:
+             print(f"Scholar search failed: {str(e)}")
+             papers = []
+
+         # Fallback to regular Google search if needed
+         if len(papers) < num_results:
+             remaining = num_results - len(papers)
+             search_query = f"{keywords} filetype:pdf site:({' OR site:'.join(self.scientific_domains)})"
+             encoded_query = urllib.parse.quote(search_query)
+             search_url = f"https://www.google.com/search?q={encoded_query}&num={remaining*2}"  # Get more results as some might fail

+             try:
+                 response = requests.get(search_url, headers=self.headers, timeout=10)
+                 soup = BeautifulSoup(response.text, 'html.parser')
+
+                 for result in soup.find_all('a'):
+                     if len(papers) >= num_results:
+                         break
+
+                     link = result.get('href', '')
+                     if any(domain in link for domain in self.scientific_domains):
+                         clean_link = re.search(r'https?://[^&]+', link)
+                         if clean_link:
+                             papers.append({
+                                 'url': clean_link.group(0),
+                                 'title': '',  # Will be filled during scraping
+                                 'abstract': ''  # Will be filled during scraping
+                             })
+
+             except Exception as e:
+                 print(f"Google search failed: {str(e)}")
+
+         return papers
+
+     def scrape_paper(self, paper: Dict) -> Dict[str, str]:
          try:
+             response = requests.get(paper['url'], headers=self.headers, timeout=15)
              soup = BeautifulSoup(response.text, 'html.parser')

              # Try different title selectors
+             if not paper['title']:
+                 title_selectors = [
+                     'h1',
+                     'h1.article-title',
+                     'h1.title',
+                     'div.article-title',
+                     'meta[name="citation_title"]'
+                 ]
+
+                 for selector in title_selectors:
+                     title_elem = soup.select_one(selector)
+                     if title_elem:
+                         paper['title'] = title_elem.get('content', '') or title_elem.text.strip()
+                         break
+
+             # Try different abstract/content selectors
+             if not paper['abstract']:
+                 content_selectors = [
+                     'div.abstract',
+                     'section.abstract',
+                     'div#abstract',
+                     'meta[name="description"]',
+                     'div.paper-content',
+                     'div.article-content'
+                 ]
+
+                 for selector in content_selectors:
+                     content_elem = soup.select_one(selector)
+                     if content_elem:
+                         paper['abstract'] = content_elem.get('content', '') or content_elem.text.strip()
+                         break
+
+             if not paper['abstract']:
+                 # Try to get main content
+                 paragraphs = soup.find_all('p')
+                 content = ' '.join([p.text.strip() for p in paragraphs[:5]])  # Get first 5 paragraphs
+                 if content:
+                     paper['abstract'] = content
+
+             # Clean up text
+             paper['title'] = paper['title'] or "Title not found"
+             paper['abstract'] = paper['abstract'] or "Content not found"
+             paper['abstract'] = re.sub(r'\s+', ' ', paper['abstract'])
+
+             return paper
          except Exception as e:
+             print(f"Error scraping {paper['url']}: {str(e)}")
+             return paper

      def perform_research(self, question: str) -> str:
          # Extract keywords
          keywords = self.extract_keywords(question)
          print(f"Keywords extracted: {keywords}")

+         # Search for papers
+         papers = self.search_papers(keywords)
+         print(f"Found {len(papers)} papers")

+         # Scrape full content
          articles = []
+         for paper in papers:
+             article = self.scrape_paper(paper)
+             if article and article['abstract'] != "Content not found":
                  articles.append(article)
              time.sleep(1)  # Polite delay between requests

          if not articles:
+             return "I apologize, but I couldn't find any relevant scientific papers to answer your question. Please try rephrasing your question or using different terms."

          # Prepare documents for RAG
          text_splitter = RecursiveCharacterTextSplitter(
 
          for article in articles:
              chunks = text_splitter.split_text(article['abstract'])
              for chunk in chunks:
+                 texts.append(f"Title: {article['title']}\n\nContent: {chunk}\n\nSource: {article['url']}")

          # Create vector store
          embeddings = HuggingFaceEmbeddings()
 
          )

          # Get answer with references
+         result = qa_chain({"query": f"""Based on the provided scientific papers, please answer this question: {question}
+             If you can't find a direct answer, summarize the most relevant information from the papers.
+             Include specific findings, data, and methodology when available."""})

          answer = result['result']

          # Format response with article summaries
+         response = f"Answer: {answer}\n\nReferences:\n\n"
          for article in articles:
+             response += f"Title: {article['title']}\nURL: {article['url']}\n\n"

          return response
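For context, a minimal driver for the updated agent might look like the sketch below; the environment-variable handling and the example question are illustrative assumptions, not part of this commit:

import os
from research_agent import ResearchAgent

# Assumes the OpenAI key is supplied via the environment (hypothetical setup)
agent = ResearchAgent(openai_api_key=os.environ["OPENAI_API_KEY"])

# Hypothetical research question; perform_research returns the formatted answer string
print(agent.perform_research("What are the current first-line treatments for diabetic retinopathy?"))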