# papercast/processing/paper_discovery.py
"""
Paper Auto-Discovery (PAD) Module
Provides intelligent paper search across multiple sources:
- Semantic Scholar Graph API v1
- arXiv API
Aggregates results and provides unified interface for paper discovery.
"""
import requests
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
logger = logging.getLogger(__name__)
class PaperSearchResult:
"""Represents a single paper search result"""
def __init__(
self,
title: str,
authors: List[str],
year: Optional[int],
abstract: str,
url: str,
pdf_url: Optional[str],
source: str, # "semantic_scholar" or "arxiv"
paper_id: str,
):
self.title = title
self.authors = authors
self.year = year
self.abstract = abstract
self.url = url
self.pdf_url = pdf_url
self.source = source
self.paper_id = paper_id
def to_dict(self) -> Dict:
"""Convert to dictionary for easy JSON serialization"""
return {
"title": self.title,
"authors": self.authors,
"year": self.year,
"abstract": self.abstract,
"url": self.url,
"pdf_url": self.pdf_url,
"source": self.source,
"paper_id": self.paper_id,
}
    def __repr__(self):
        authors_str = ", ".join(self.authors[:3])
        if len(self.authors) > 3:
            authors_str += " et al."
        # Only add an ellipsis when the title is actually truncated
        title = self.title if len(self.title) <= 50 else self.title[:50] + "..."
        return f"<PaperSearchResult: {title} by {authors_str} ({self.year})>"
class PaperDiscoveryEngine:
"""
PAD - Paper Auto-Discovery Engine
Searches for research papers across multiple sources and returns
unified results with PDF links when available.
"""
SEMANTIC_SCHOLAR_API = "https://api.semanticscholar.org/graph/v1/paper/search"
ARXIV_API = "http://export.arxiv.org/api/query"
def __init__(self, max_results: int = 5):
self.max_results = max_results
self.session = requests.Session()
# Set user agent to avoid 403 errors
self.session.headers.update({
"User-Agent": "PaperCast/1.0 (Research Paper Discovery; batuhan@papercast.io)"
})
    def search(self, query: str) -> List[PaperSearchResult]:
        """
        Search for papers across all sources in parallel.

        Args:
            query: Search query (e.g., "diffusion models", "Grok reasoning")

        Returns:
            Deduplicated list of PaperSearchResult objects. Each source
            returns its own relevance-ranked results; the merged order
            depends on which source responds first.
        """
logger.info(f"PAD: Searching for '{query}'")
results = []
# Run both API calls in parallel for speed
with ThreadPoolExecutor(max_workers=2) as executor:
future_semantic = executor.submit(self._search_semantic_scholar, query)
future_arxiv = executor.submit(self._search_arxiv, query)
# Collect results as they complete
for future in as_completed([future_semantic, future_arxiv]):
try:
partial_results = future.result()
results.extend(partial_results)
except Exception as e:
logger.error(f"PAD: Search failed for one source: {e}")
# Deduplicate by title (case-insensitive)
seen_titles = set()
unique_results = []
for result in results:
title_lower = result.title.lower().strip()
if title_lower not in seen_titles:
seen_titles.add(title_lower)
unique_results.append(result)
# Limit to max_results
unique_results = unique_results[:self.max_results]
logger.info(f"PAD: Found {len(unique_results)} unique papers")
return unique_results
def _search_semantic_scholar(self, query: str) -> List[PaperSearchResult]:
"""Search Semantic Scholar Graph API v1"""
try:
logger.debug("PAD: Querying Semantic Scholar...")
params = {
"query": query,
"fields": "title,authors,year,abstract,openAccessPdf,url,paperId",
"limit": self.max_results,
}
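            # The "fields" parameter controls which attributes the Graph API
            # returns; "openAccessPdf" includes a direct PDF link when the
            # paper is open access.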
response = self.session.get(
self.SEMANTIC_SCHOLAR_API,
params=params,
timeout=10
)
# Handle rate limiting gracefully - just skip Semantic Scholar
if response.status_code == 429:
logger.warning("PAD: Semantic Scholar rate limit exceeded (429). Relying on arXiv results.")
return []
response.raise_for_status()
data = response.json()
papers = data.get("data", [])
results = []
for paper in papers:
# Extract PDF URL if available
pdf_url = None
if paper.get("openAccessPdf"):
pdf_url = paper["openAccessPdf"].get("url")
# Extract author names
authors = []
for author in paper.get("authors", []):
if "name" in author:
authors.append(author["name"])
result = PaperSearchResult(
title=paper.get("title", "Untitled"),
authors=authors,
year=paper.get("year"),
abstract=paper.get("abstract", "No abstract available."),
url=paper.get("url", ""),
pdf_url=pdf_url,
source="semantic_scholar",
paper_id=paper.get("paperId", ""),
)
results.append(result)
logger.debug(f"PAD: Semantic Scholar returned {len(results)} papers")
return results
except Exception as e:
logger.error(f"PAD: Semantic Scholar search failed: {e}")
return []
def _search_arxiv(self, query: str) -> List[PaperSearchResult]:
"""Search arXiv API"""
try:
logger.debug("PAD: Querying arXiv...")
params = {
"search_query": f"all:{query}",
"max_results": self.max_results,
"sortBy": "relevance",
"sortOrder": "descending",
}
response = self.session.get(
self.ARXIV_API,
params=params,
timeout=10
)
response.raise_for_status()
# Parse XML response
root = ET.fromstring(response.content)
# Define namespace
ns = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom"
}
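            # ElementTree needs an explicit namespace map: every find/findall
            # below uses the "atom:" prefix defined here.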
results = []
for entry in root.findall("atom:entry", ns):
                # Extract title, collapsing the feed's hard line breaks
                title_elem = entry.find("atom:title", ns)
                title = "Untitled"
                if title_elem is not None and title_elem.text:
                    title = " ".join(title_elem.text.split())
# Extract authors
authors = []
                for author in entry.findall("atom:author", ns):
                    name_elem = author.find("atom:name", ns)
                    if name_elem is not None and name_elem.text:
                        authors.append(name_elem.text.strip())
                # Extract abstract, collapsing the feed's hard line breaks
                summary_elem = entry.find("atom:summary", ns)
                abstract = "No abstract available."
                if summary_elem is not None and summary_elem.text:
                    abstract = " ".join(summary_elem.text.split())
# Extract URL (abstract page)
url_elem = entry.find("atom:id", ns)
url = url_elem.text.strip() if url_elem is not None else ""
# Extract PDF URL
pdf_url = None
for link in entry.findall("atom:link", ns):
if link.get("type") == "application/pdf":
pdf_url = link.get("href")
break
# Extract year from published date
published_elem = entry.find("atom:published", ns)
year = None
if published_elem is not None:
try:
year = int(published_elem.text[:4])
except (ValueError, TypeError):
pass
# Extract arXiv ID
paper_id = url.split("/")[-1] if url else ""
result = PaperSearchResult(
title=title,
authors=authors,
year=year,
abstract=abstract,
url=url,
pdf_url=pdf_url,
source="arxiv",
paper_id=paper_id,
)
results.append(result)
logger.debug(f"PAD: arXiv returned {len(results)} papers")
return results
except Exception as e:
logger.error(f"PAD: arXiv search failed: {e}")
return []
def get_pdf_url(self, result: PaperSearchResult) -> Optional[str]:
"""
Get the best available PDF URL for a search result.
Returns direct PDF URL if available, otherwise returns the paper URL
which can be processed by the existing fetching logic.
"""
if result.pdf_url:
return result.pdf_url
# For arXiv papers without direct PDF link, construct it
if result.source == "arxiv" and result.url:
# Convert abstract URL to PDF URL
# https://arxiv.org/abs/2301.12345 -> https://arxiv.org/pdf/2301.12345.pdf
return result.url.replace("/abs/", "/pdf/") + ".pdf"
# Return the paper URL as fallback (existing logic can handle it)
return result.url
# Convenience function for easy import
def search_papers(query: str, max_results: int = 5) -> List[PaperSearchResult]:
"""
Search for research papers across multiple sources.
Args:
query: Search query (e.g., "diffusion models", "Grok reasoning")
max_results: Maximum number of results to return (default: 5)
Returns:
List of PaperSearchResult objects
    Example:
        >>> results = search_papers("transformer attention mechanisms")
        >>> for paper in results:
        ...     print(f"{paper.title} ({paper.year})")
        ...     print(f"  PDF: {paper.pdf_url}")
"""
engine = PaperDiscoveryEngine(max_results=max_results)
return engine.search(query)
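

# Minimal manual smoke test; the query string below is only an example,
# not part of the module's API.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    engine = PaperDiscoveryEngine(max_results=3)
    for paper in engine.search("retrieval augmented generation"):
        print(paper)
        print(f"  PDF: {engine.get_pdf_url(paper)}")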