import xml.etree.ElementTree as ET

import requests
from scholarly import scholarly


class DataLoader:
    def __init__(self, search_agent=None):
        # Optional agent used to suggest related topics when an ArXiv
        # search returns fewer than five papers (see fetch_arxiv_papers).
        self.search_agent = search_agent
        print("DataLoader Init")

    def fetch_arxiv_papers(self, query):
        """
        Fetches the top 5 research papers from ArXiv for the given query.

        If fewer than 5 papers are found and a search agent is available,
        the search is expanded with related topics suggested by the agent.

        Returns:
            list: A list of dictionaries with paper details (title, summary, link).
        """

        def search_arxiv(query):
            """Helper function to query the ArXiv API."""
            url = f"http://export.arxiv.org/api/query?search_query=all:{query}&start=0&max_results=5"
            response = requests.get(url)
            if response.status_code == 200:
                # ArXiv returns an Atom feed; each <entry> element is one paper.
                root = ET.fromstring(response.text)
                atom = "{http://www.w3.org/2005/Atom}"
                return [
                    {
                        "title": entry.find(f"{atom}title").text,
                        "summary": entry.find(f"{atom}summary").text,
                        "link": entry.find(f"{atom}id").text,
                    }
                    for entry in root.findall(f"{atom}entry")
                ]
            return []

        papers = search_arxiv(query)

        # If the direct search came up short, ask the search agent for related
        # topics and keep searching until we have 5 papers or run out of topics.
        if len(papers) < 5 and self.search_agent:
            related_topics_response = self.search_agent.generate_reply(
                messages=[{"role": "user", "content": f"Suggest 3 related research topics for '{query}'"}]
            )
            related_topics = related_topics_response.get("content", "").split("\n")

            for topic in related_topics:
                topic = topic.strip()
                if topic and len(papers) < 5:
                    new_papers = search_arxiv(topic)
                    papers.extend(new_papers)
                    papers = papers[:5]

        return papers

    def fetch_google_scholar_papers(self, query):
        """
        Fetches the top 5 research papers from Google Scholar for the given query.

        Returns:
            list: A list of dictionaries with paper details (title, summary, link).
        """

        papers = []
        search_results = scholarly.search_pubs(query)

        # scholarly returns a generator; take at most the first 5 results.
        for i, paper in enumerate(search_results):
            if i >= 5:
                break
            papers.append({
                "title": paper["bib"]["title"],
                "summary": paper["bib"].get("abstract", "No summary available"),
                "link": paper.get("pub_url", "No link available"),
            })
        return papers
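

# Usage sketch (an illustration, not part of the original module): DataLoader is
# expected to be driven by an agent object exposing a generate_reply() method that
# returns a dict with a "content" key. The SearchAgent below is hypothetical and
# only shows that expected shape. Running this requires network access to the
# ArXiv API and Google Scholar (scholarly may be rate limited).
if __name__ == "__main__":
    class SearchAgent:
        """Hypothetical stand-in for an LLM agent with a generate_reply() method."""

        def generate_reply(self, messages):
            # A real agent would call an LLM; here we return fixed related topics.
            return {"content": "transformer architectures\nretrieval augmented generation\nefficient attention"}

    loader = DataLoader(search_agent=SearchAgent())
    arxiv_papers = loader.fetch_arxiv_papers("large language models")
    scholar_papers = loader.fetch_google_scholar_papers("large language models")
    for paper in arxiv_papers + scholar_papers:
        print(paper["title"])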